import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# notebook-conversion artifact removed: a dataset path ("/kaggle/input/...train.csv")
# had been fused onto the end of this line, making it a runtime NameError
sns.set_theme(style='whitegrid')
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter #for IQR method
from scipy.stats import median_abs_deviation #for modified z-score
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.feature_selection import f_regression, VarianceThreshold, SelectKBest, SelectPercentile, chi2, f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
"""
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='whitegrid')
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter #for IQR method
from scipy.stats import median_abs_deviation #for modified z-score
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.feature_selection import f_regression, VarianceThreshold, SelectKBest, SelectPercentile, chi2, f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
"""
#calculate adjusted r2
def m_adjusted_r2(obs_num, var_num,r2):
    """Return the adjusted R-squared for obs_num observations and var_num predictors."""
    penalty = (obs_num - 1) / (obs_num - var_num - 1)
    return 1 - (1 - r2) * penalty
#show features regression in Dataframe
def m_fregression(x,y):
    """Return a DataFrame with the F-statistic ('coefficient') and p-value of each feature of x against y."""
    # f_regression is loop-invariant: compute the F-scores and p-values once, not once per column
    f_stats, p_vals = f_regression(x, y)
    rows = [{'feature': x.columns[i],
             'coefficient': f_stats[i].round(2),
             'pval': p_vals[i].round(5)}
            for i in range(x.shape[1])]
    # DataFrame.append was removed in pandas 2.0; build the frame in one shot instead
    return pd.DataFrame(rows, columns=['feature','coefficient','pval'])
#calculating multicolinearity between variables
def m_VIF(x,y):
    """Print VIF interpretation notes and return a DataFrame of VIF values per predictor.

    x: DataFrame of independent variables; y: single-column DataFrame holding the target.
    """
    # build the patsy formula 'target~f1+f2+...' directly (replaces the manual '+' loop
    # and the stray no-op 'string_fun' expression of the original)
    string_fun = '{}~{}'.format(y.columns[0], '+'.join(x.columns))
    # full dataframe containing dependent and independent variables, index-aligned
    df_vif_gen = pd.merge(left=y, right=x, left_index=True, right_index=True)
    # design matrices for the regression model (patsy adds an Intercept column)
    y, x = dmatrices(string_fun, data=df_vif_gen, return_type='dataframe')
    # DataFrame holding one VIF per design-matrix column
    vif_df = pd.DataFrame()
    vif_df['variable'] = x.columns
    vif_df['VIF'] = np.round([variance_inflation_factor(x.values, i) for i in range(x.shape[1])], 2)
    # interpretation guide
    print ('VIF=1: There is no correlation between a given predictor variable and any other predictor variables in the model.\n')
    print ('VIF=(1-5): There is moderate correlation between a given predictor variable and other predictor variables in the model.\n')
    print ('VIF>5: There is severe correlation between a given predictor variable and other predictor variables in the model.')
    return vif_df
# find correlated features
def m_correlation(dataset, threshold):
    """Return the unique names of columns whose absolute correlation with an earlier column exceeds threshold."""
    col_corr = []
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # we are interested in absolute coeff value
                col_corr.append(corr_matrix.columns[i])
    # de-duplicate: a column can trip the threshold against several partners;
    # this also makes this copy consistent with the later definition in this file
    return list(set(col_corr))
#reduce dataframe by quantile value (outlier), return the reduced dataframe and print the curve
def m_df_trim (dataframe,column_name,q_low,q_high):
    """Keep rows whose column_name value lies in the (q_low, q_high] quantile interval.

    Prints the percentage of rows lost and draws before/after histograms; returns the reduced frame.
    """
    # resolve the quantile fractions into cut-off values (without shadowing the parameters)
    low_val = dataframe[column_name].quantile(q=q_low)
    high_val = dataframe[column_name].quantile(q=q_high)
    df_reduced = dataframe[(dataframe[column_name] > low_val) & (dataframe[column_name] <= high_val)]
    print (np.round(100 - df_reduced.shape[0] / dataframe.shape[0] * 100, 2), "% of data will be lost")
    fig, ax = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)
    sns.histplot(data=dataframe, x=column_name, ax=ax[0], label='Original')
    sns.histplot(data=df_reduced, x=column_name, ax=ax[1], label='Reduced')
    ax[0].legend()
    ax[1].legend()
    return df_reduced
#creating different regression models and a dataframe containing the summary info
def m_mlinear_regression(xtr,xts,ytr,yts):
    """Fit a suite of regressors on raw and on standard-scaled features and summarise their R2 scores.

    Returns a DataFrame with columns ['index','Model','Type','Scaled','Score'] sorted by
    Score descending; 'index' preserves the original fit order.
    """
    rows = []
    def _evaluate(model, model_name, model_type, scaled, xtrain, xtest):
        # fit one model, score it on the held-out split, and record the result
        model.fit(xtrain, ytr)
        ypr = model.predict(xtest)
        rows.append({'Model': model_name, 'Type': model_type,
                     'Scaled': scaled, 'Score': r2_score(yts, ypr)})
    #---------------SCALING-------------------#
    sc = StandardScaler()
    xtr_sc = sc.fit_transform(xtr)
    xts_sc = sc.transform(xts)
    #-----------------------------------------#
    for scaled, xa, xb in (('No', xtr, xts), ('Yes', xtr_sc, xts_sc)):
        #multiple linear regression
        _evaluate(LinearRegression(), 'Linear', 'General', scaled, xa, xb)
        #support vector regression, one model per kernel
        for kernel in ('rbf', 'linear', 'poly', 'sigmoid'):
            _evaluate(SVR(kernel=kernel), 'SVR', kernel, scaled, xa, xb)
        #decision tree regression
        _evaluate(DecisionTreeRegressor(), 'Decision tree', 'General', scaled, xa, xb)
        #random forest regression, one model per forest size
        for n_trees in (100, 200, 300, 400, 500):
            _evaluate(RandomForestRegressor(n_estimators=n_trees), 'Random forest', n_trees, scaled, xa, xb)
        #XGBoost ('Type' previously leaked the last loop value by mistake; now 'General')
        _evaluate(XGBRegressor(), 'XGBoost', 'General', scaled, xa, xb)
    # DataFrame.append was removed in pandas 2.0: build the summary in one shot
    model_summary = pd.DataFrame(rows, columns=['Model','Type','Scaled','Score'])
    model_summary = model_summary.sort_values(by='Score', ascending=False)
    model_summary.reset_index(inplace=True)
    return model_summary
#return a list and graph of how features importance to target value
def m_feature_importance (xtrain,ytrain,n_estimators):
    """Fit a RandomForestRegressor and return the feature importances sorted descending (also plotted)."""
    # honour the n_estimators argument (it was previously ignored in favour of a hard-coded 100)
    rf = RandomForestRegressor(n_estimators=n_estimators)
    rf.fit(xtrain, ytrain)
    df_features_importance = pd.DataFrame({'features': xtrain.columns,
                                           'importance': rf.feature_importances_})
    df_features_importance.sort_values(by='importance', axis=0, inplace=True, ascending=False)
    sns.barplot(x=df_features_importance['importance'],
                y=df_features_importance['features'],
                color='b')
    return df_features_importance
#calculate adjusted r2
def m_adjusted_r2(obs_num, var_num,r2):
    """Adjusted R-squared: penalises r2 for var_num predictors given obs_num observations."""
    dof_total = obs_num - 1
    dof_resid = obs_num - var_num - 1
    return 1.0 - (1.0 - r2) * dof_total / dof_resid
#show features regression in Dataframe
def m_fregression(x,y):
    """Return a DataFrame with the F-statistic ('coefficient') and p-value of each feature of x against y."""
    # f_regression is loop-invariant: compute the F-scores and p-values once, not once per column
    f_stats, p_vals = f_regression(x, y)
    rows = [{'feature': x.columns[i],
             'coefficient': f_stats[i].round(2),
             'pval': p_vals[i].round(5)}
            for i in range(x.shape[1])]
    # DataFrame.append was removed in pandas 2.0; build the frame in one shot instead
    return pd.DataFrame(rows, columns=['feature','coefficient','pval'])
#calculating multicolinearity between variables
def m_VIF(x,y):
    """Print VIF interpretation notes and return a DataFrame of VIF values per predictor.

    x: DataFrame of independent variables; y: single-column DataFrame holding the target.
    """
    # build the patsy formula 'target~f1+f2+...' directly (replaces the manual '+' loop
    # and the stray no-op 'string_fun' expression of the original)
    string_fun = '{}~{}'.format(y.columns[0], '+'.join(x.columns))
    # full dataframe containing dependent and independent variables, index-aligned
    df_vif_gen = pd.merge(left=y, right=x, left_index=True, right_index=True)
    # design matrices for the regression model (patsy adds an Intercept column)
    y, x = dmatrices(string_fun, data=df_vif_gen, return_type='dataframe')
    # DataFrame holding one VIF per design-matrix column
    vif_df = pd.DataFrame()
    vif_df['variable'] = x.columns
    vif_df['VIF'] = np.round([variance_inflation_factor(x.values, i) for i in range(x.shape[1])], 2)
    # interpretation guide
    print ('VIF=1: There is no correlation between a given predictor variable and any other predictor variables in the model.\n')
    print ('VIF=(1-5): There is moderate correlation between a given predictor variable and other predictor variables in the model.\n')
    print ('VIF>5: There is severe correlation between a given predictor variable and other predictor variables in the model.')
    return vif_df
# find correlated features
def m_correlation(dataset, threshold):
    """Return the unique names of columns whose absolute correlation with an earlier column exceeds threshold."""
    corr_matrix = dataset.corr()
    cols = corr_matrix.columns
    flagged = []
    for i, name in enumerate(cols):
        # scan only the lower triangle so each pair is inspected once
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                flagged.append(name)
    # a column may be flagged against several partners; report each name once
    return list(set(flagged))
#list the features which have correlation of specific values against the rest of dataset features
def m_corr_list(df,min_up=0.6,min_down=-0.6):
    """Print, per feature, the other features correlated above min_up or below min_down."""
    corr_mat = df.corr()
    for x in corr_mat.index:
        corr_feat_up = []    # partners with strong positive correlation
        corr_feat_down = []  # partners with strong negative correlation
        for y in corr_mat.columns:
            if x == y:
                continue  # skip feature against itself
            val = corr_mat.loc[x, y]
            if val >= min_up:
                corr_feat_up.append(y)
            elif val <= min_down:
                corr_feat_down.append(y)
        # report (in bold ANSI) only when something crossed a threshold
        if corr_feat_up:
            print ('\033[1m'+x+'\033[0m'+' is positively correlated with {}'.format(corr_feat_up))
        if corr_feat_down:
            print ('\033[1m'+x+'\033[0m'+' is negatively correlated with {}'.format(corr_feat_down))
        if corr_feat_up or corr_feat_down:
            print ('')
#reduce dataframe by quantile value (outlier), return the reduced dataframe and print the curve
def m_df_trim (dataframe,column_name,q_low,q_high):
    """Keep rows whose column_name value lies in the (q_low, q_high] quantile interval.

    Prints the percentage of rows lost and draws before/after histograms; returns the reduced frame.
    """
    # resolve the quantile fractions into cut-off values (without shadowing the parameters)
    low_val = dataframe[column_name].quantile(q=q_low)
    high_val = dataframe[column_name].quantile(q=q_high)
    df_reduced = dataframe[(dataframe[column_name] > low_val) & (dataframe[column_name] <= high_val)]
    print (np.round(100 - df_reduced.shape[0] / dataframe.shape[0] * 100, 2), "% of data will be lost")
    fig, ax = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)
    sns.histplot(data=dataframe, x=column_name, ax=ax[0], label='Original')
    sns.histplot(data=df_reduced, x=column_name, ax=ax[1], label='Reduced')
    ax[0].legend()
    ax[1].legend()
    return df_reduced
#creating different regression models and a dataframe containing the summary info
def m_mlinear_regression(xtr,xts,ytr,yts):
    """Fit a suite of regressors on raw and on standard-scaled features and summarise their R2 scores.

    Returns a DataFrame with columns ['index','Model','Type','Scaled','Score'] sorted by
    Score descending; 'index' preserves the original fit order.
    """
    rows = []
    def _evaluate(model, model_name, model_type, scaled, xtrain, xtest):
        # fit one model, score it on the held-out split, and record the result
        model.fit(xtrain, ytr)
        ypr = model.predict(xtest)
        rows.append({'Model': model_name, 'Type': model_type,
                     'Scaled': scaled, 'Score': r2_score(yts, ypr)})
    #---------------SCALING-------------------#
    sc = StandardScaler()
    xtr_sc = sc.fit_transform(xtr)
    xts_sc = sc.transform(xts)
    #-----------------------------------------#
    for scaled, xa, xb in (('No', xtr, xts), ('Yes', xtr_sc, xts_sc)):
        #multiple linear regression
        _evaluate(LinearRegression(), 'Linear', 'General', scaled, xa, xb)
        #support vector regression, one model per kernel
        for kernel in ('rbf', 'linear', 'poly', 'sigmoid'):
            _evaluate(SVR(kernel=kernel), 'SVR', kernel, scaled, xa, xb)
        #decision tree regression
        _evaluate(DecisionTreeRegressor(), 'Decision tree', 'General', scaled, xa, xb)
        #random forest regression, one model per forest size
        for n_trees in (100, 200, 300, 400, 500):
            _evaluate(RandomForestRegressor(n_estimators=n_trees), 'Random forest', n_trees, scaled, xa, xb)
        #XGBoost ('Type' previously leaked the last loop value by mistake; now 'General')
        _evaluate(XGBRegressor(), 'XGBoost', 'General', scaled, xa, xb)
        #gradient boosting (the scaled variant was previously fitted but never scored/recorded)
        _evaluate(GradientBoostingRegressor(), 'GradientBoosting', 'General', scaled, xa, xb)
    # DataFrame.append was removed in pandas 2.0: build the summary in one shot
    model_summary = pd.DataFrame(rows, columns=['Model','Type','Scaled','Score'])
    model_summary = model_summary.sort_values(by='Score', ascending=False)
    model_summary.reset_index(inplace=True)
    return model_summary
#return a list and graph of how features importance to target value
def m_feature_importance (xtrain,ytrain,n_estimators):
    """Fit a RandomForestRegressor and return the feature importances sorted descending (also plotted)."""
    # honour the n_estimators argument (it was previously ignored in favour of a hard-coded 100)
    rf = RandomForestRegressor(n_estimators=n_estimators)
    rf.fit(xtrain, ytrain)
    df_features_importance = pd.DataFrame({'features': xtrain.columns,
                                           'importance': rf.feature_importances_})
    df_features_importance.sort_values(by='importance', axis=0, inplace=True, ascending=False)
    sns.barplot(x=df_features_importance['importance'],
                y=df_features_importance['features'],
                color='b')
    return df_features_importance
#split dataframe into target variable and features
def m_xy_split(df,target_column):
    """Split df into features x (every column except target_column) and target y (single-column frame)."""
    feature_cols = df.columns.drop(target_column)
    return df[feature_cols], df[[target_column]]
#from collections import Counter #for IQR method
def m_outlier_iqr (df,n,features):
    """
    Takes a dataframe and returns an index list corresponding to the observations
    containing more than n outliers according to the Tukey IQR method.
    """
    flagged = []
    for column in features:
        # Tukey fences: 1.5 * IQR beyond the quartiles
        q1 = np.percentile(df[column], 25)
        q3 = np.percentile(df[column], 75)
        step = 1.5 * (q3 - q1)
        mask = (df[column] < q1 - step) | (df[column] > q3 + step)
        flagged.extend(df[mask].index)
    # keep observations flagged in more than n feature columns
    counts = Counter(flagged)
    multiple_outliers = [k for k, v in counts.items() if v > n]
    # NOTE(review): as in the original, these printed totals reflect only the LAST feature's fences
    below = df[df[column] < q1 - step]
    above = df[df[column] > q3 + step]
    print('Total number of outliers is:', below.shape[0] + above.shape[0])
    print('Total number of observations containing more than {} outliers is: {} '.format(n,len(multiple_outliers)))
    print('Percentage of data to be lost is: %{}'.format(np.round(len(multiple_outliers)/df.shape[0]*100,0)))
    return multiple_outliers
def m_outlier_std (df,n,features):
    """
    Takes a dataframe df of features and returns an index list corresponding to the observations
    containing more than n outliers according to the standard deviation method.
    """
    flagged = []
    for column in features:
        # 3-sigma rule: anything beyond mean +/- 3*std is an outlier
        mean = df[column].mean()
        cut_off = 3 * df[column].std()
        mask = (df[column] < mean - cut_off) | (df[column] > mean + cut_off)
        flagged.extend(df[mask].index)
    # keep observations flagged in more than n feature columns
    counts = Counter(flagged)
    multiple_outliers = [k for k, v in counts.items() if v > n]
    # NOTE(review): as in the original, this printed total reflects only the LAST feature's cut-off
    above = df[df[column] > mean + cut_off]
    below = df[df[column] < mean - cut_off]
    print('Total number of outliers is:', above.shape[0] + below.shape[0])
    print('Total number of observations containing more than {} outliers is: {} '.format(n,len(multiple_outliers)))
    print('Percentage of data to be lost is: %{}'.format(np.round(len(multiple_outliers)/df.shape[0]*100,0)))
    return multiple_outliers
#from scipy.stats import median_abs_deviation #for modified z-score
def m_outlier_zscore (df,n,features):
    """
    Takes a dataframe df of features and returns an index list corresponding to the observations
    containing more than n outliers according to the z-score method.
    """
    threshold = 3
    flagged = []
    for column in features:
        # standardise the column and flag rows whose |z| exceeds the threshold
        z_score = abs((df[column] - df[column].mean()) / df[column].std())
        flagged.extend(df[z_score > threshold].index)
    # keep observations flagged in more than n feature columns
    counts = Counter(flagged)
    multiple_outliers = [k for k, v in counts.items() if v > n]
    # NOTE(review): as in the original, this printed total reflects only the LAST feature's z-scores
    print('Total number of outliers is:', df[z_score > threshold].shape[0])
    print('Total number of observations containing more than {} outliers is: {} '.format(n,len(multiple_outliers)))
    print('Percentage of data to be lost is: %{}'.format(np.round(len(multiple_outliers)/df.shape[0]*100,0)))
    return multiple_outliers
def m_outlier_zscore_modified (df,n,features):
    """
    Takes a dataframe df of features and returns an index list corresponding to the observations
    containing more than n outliers according to the z-score modified method.
    """
    threshold = 3
    flagged = []
    for column in features:
        # modified z-score: 0.6745 * (x - mean) / MAD
        # NOTE(review): the original centres on the mean (the textbook form uses the median) — preserved
        mod_z_score = abs(0.6745 * (df[column] - df[column].mean()) / median_abs_deviation(df[column]))
        flagged.extend(df[mod_z_score > threshold].index)
    # keep observations flagged in more than n feature columns
    counts = Counter(flagged)
    multiple_outliers = [k for k, v in counts.items() if v > n]
    # NOTE(review): as in the original, this printed total reflects only the LAST feature
    print('Total number of outliers is:', df[mod_z_score > threshold].shape[0])
    print('Total number of observations containing more than {} outliers is: {} '.format(n,len(multiple_outliers)))
    print('Percentage of data to be lost is: %{}'.format(np.round(len(multiple_outliers)/df.shape[0]*100,0)))
    return multiple_outliers
#from sklearn.ensemble import IsolationForest
def m_outlier_isolationforest(df,features=[]):
    """Fit an IsolationForest on the selected features; return df[features] plus an 'anomaly' column (-1 = outlier)."""
    # work on an explicit copy: adding a column to df[features] would otherwise
    # write into a view of the caller's frame (SettingWithCopy)
    df = df[features].copy()
    outlier_model = IsolationForest()
    outlier_model.fit(df)
    df['anomaly'] = outlier_model.predict(df)
    n_out = df[(df['anomaly']==-1)]['anomaly'].count()
    print ('The total number of outliers is: ', n_out)
    print('Percentage of data to be lost is: %{}'.format(np.round(n_out/df.shape[0]*100,0)))
    return df
#from sklearn.cluster import DBSCAN
def m_outlier_DBSCAN(df,features=[]):
    """Cluster the selected (standard-scaled) features with DBSCAN; return df[features] plus a 'label' column (-1 = outlier)."""
    # explicit copy: avoid mutating a view of the caller's frame
    df = df[features].copy()
    sc = StandardScaler()
    df_sc = sc.fit_transform(df)
    model_DBSCAN = DBSCAN()
    model_DBSCAN.fit(df_sc)
    labels = model_DBSCAN.labels_
    # noise points are labelled -1 and do not count as a cluster
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    labels = pd.Series(labels)
    print ('The total number of clusters without outliers: ',n_clusters)
    print ('The number of outliers is: ',labels[(labels==-1)].count())
    print('Percentage of data to be lost is: %{}'.format(np.round(labels[(labels==-1)].count()/df.shape[0]*100,0)))
    # assign positionally: df may carry a non-default index while pd.Series(labels) is
    # indexed 0..n-1, so index-aligned assignment would silently produce NaN labels
    df['label'] = labels.values
    return df
#run every outlier-detection method defined above and intersect their verdicts
def m_outlier_all (df,n,features=[]):
    """Apply all six outlier-detection methods and return the set of indices flagged by every one.

    df: input DataFrame; n: per-method threshold for how many feature columns must flag
    an observation; features: the columns to inspect.
    """
    print ("Method IQR:")
    out_iqr = m_outlier_iqr(df,n=n,features=features)
    print ("\nMethod Standard deviation:")
    out_std = m_outlier_std(df,n=n,features=features)
    print ("\nMethod Z-Score:")
    out_zsc = m_outlier_zscore(df,n=n,features=features)
    print ("\nMethod Modified Z-Score:")
    out_mzs = m_outlier_zscore_modified(df,n=n,features=features)
    print ("\nMethod Isolation forest:")
    df_isf = m_outlier_isolationforest(df,features=features)
    out_isf = list(df_isf[(df_isf['anomaly']==-1)].index)
    print ("\nMethod DBSCAN:")
    df_dbscan = m_outlier_DBSCAN(df,features=features)
    out_dbs = list(df_dbscan[(df_dbscan['label']==-1)].index)
    #consensus: an observation must be flagged by every method to be reported
    out_total = set(out_iqr).intersection(out_std,out_zsc,out_mzs,out_dbs,out_isf)
    print (100*'-')
    print('Total number of outliers is: {} '.format(len(out_total)))
    print('Percentage of data to be lost is: %{}'.format(np.round(len(out_total)/df.shape[0]*100,0)))
    return out_total
class m_describe:
    """Collection of quick exploratory-data-analysis helpers.

    The methods take the dataframe as their first argument and are intended to be
    called through the class, e.g. ``m_describe.num(df)`` or ``m_describe.show_main(df)``.
    """
    def __init__(self,dataframe):
        # store the frame that was actually passed in
        # (previously assigned the undefined global name 'df' -> NameError)
        self.dataframe = dataframe
    def show_main (df,export_notes=False):
        """Print feature/observation counts, dtype split, missing/duplicate totals and a dtype pie chart.

        With export_notes=True, also writes a 'notes.xlsx' template listing every feature.
        """
        num_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
        cat_cols = list(df.select_dtypes(include=['object']).columns)
        print ("Total number of features: ", len(df.columns))
        print ("Total number of observations: ", len(df))
        print ('-'*70)
        print ('Total number of numerical values: ', len(num_cols))
        print ('Total number of categorical values: ', len(cat_cols))
        print ('-'*70)
        print ("Total number of missing values: ", df.isnull().sum().sum())
        print ("Total number of duplicate rows: ", df.duplicated().sum())
        plt.pie(x=[len(num_cols), len(cat_cols)],
                labels=['Numerical Features','Categorical Features'],autopct='%1.0f%%')
        plt.title('Features Types')
        #export an empty per-feature notes template for manual annotation
        if export_notes==True:
            cat_df = pd.DataFrame(cat_cols, columns=['Feature'])
            cat_df['Type']='Categorical'
            num_df = pd.DataFrame(num_cols, columns=['Feature'])
            num_df['Type']='Numerical'
            fea_df = pd.concat([num_df, cat_df])
            fea_df[['Description','Importance','Observation','To Do']]=""
            fea_df.to_excel('notes.xlsx',index=False)
    #returns a list of numerical feature names
    def num(df):
        return list(df.select_dtypes(include=['int64', 'float64']).columns)
    #returns a list of categorical feature names
    def cat(df):
        return list(df.select_dtypes(include=['object']).columns)
    #returns a dataframe of features having missing values, with counts and percentages
    def missing_val_list(df):
        cat_df = pd.DataFrame(list(df.select_dtypes(include=['object']).columns), columns=['Feature'])
        cat_df['Type']='Categorical'
        num_df = pd.DataFrame(list(df.select_dtypes(include=['int64', 'float64']).columns), columns=['Feature'])
        num_df['Type']='Numerical'
        fea_df = pd.concat([num_df, cat_df])
        x= list(df.columns)                                  #all feature names
        y= list(np.round(df.isnull().sum()/len(df)*100,0))   #% of null values per feature
        z= list((df.isnull().sum()))                         #count of null values per feature
        # note: the column name keeps the original spelling ('obesrvations') because
        # downstream code may reference it
        nul_df = pd.DataFrame({'Feature':x,'count of nulls':z,'null % of total obesrvations':y})
        nul_df = pd.merge(fea_df,nul_df,on='Feature',how='outer')
        nul_df=nul_df[(nul_df['count of nulls']>0)]          #keep only features having null values
        # keep the sorted result (the original called sort_values but discarded its return value)
        nul_df = nul_df.sort_values(by=['Type','null % of total obesrvations'],ascending=False)
        nul_df=nul_df.reset_index(drop=True)
        return nul_df
    #visualize numerical features: histogram + boxplot per feature
    def visualize_num(df,
                      features='all',
                      figure_size=(15,150),color=sns.color_palette('Set2')[0]):
        if features == 'all':
            features=list(df.select_dtypes(include=['int64', 'float64']).columns)
        fig, a=plt.subplots(nrows=len(features),ncols=2,figsize=figure_size)
        for i in range(len(features)):
            sns.histplot(data=df[features[i]],color=color,ax=a[i,0])
            sns.boxplot(y=df[features[i]],color=color,ax=a[i,1])
    #visualize categorical features: count plot + pie chart per feature
    def visualize_cat(df,
                      features='all',
                      figure_size=(15,150), color_scheme='Set2'):
        if features == 'all':
            features=list(df.select_dtypes(include=['object']).columns)
        fig, a=plt.subplots(nrows=len(features),ncols=2,figsize=figure_size)
        for i in range (len(features)):
            sorted_labels=df[features[i]].value_counts().sort_values(ascending=False).index
            sns.countplot(df[features[i]],palette=color_scheme,order=sorted_labels,ax=a[i,0])
            a[i,1].pie(df[features[i]].value_counts(),labels=list(sorted_labels),autopct='%1.0f%%',colors=sns.color_palette(sns.color_palette(color_scheme)))
    #show features whose single most frequent category holds at least min_percent of the data
    def show_category_concentraction(df,min_percent=0.95):
        """Return a dataframe of features (with unique-value counts) whose top category share >= min_percent."""
        x=pd.DataFrame(df.nunique(),columns=['Unique values'])
        x=x.reset_index()
        x=x.rename(columns={'index':'Feature'})
        x= x.sort_values(by='Unique values',ascending=True)
        #per feature, record the share of the single most frequent value
        for feature in x['Feature'].unique():
            value_counts = df[feature].value_counts()
            percentage=np.round(value_counts.max() / value_counts.sum()*100)
            x.loc[x['Feature']==feature, '1st category concentration -%'] = percentage
        x=x[x['1st category concentration -%']/100>=min_percent]
        x = x.sort_values(by=['Unique values','1st category concentration -%'],ascending=[True,False])
        x=x.reset_index(drop=True)
        return x
def m_scale_encode_split(dftrain,target_feature,num_feature,cat_feature,train_size=0.85,dftest=None):
    """One-hot encode categoricals, train/test split, and standard-scale the numeric features.

    Returns (xtr, xts, ytr, yts) and, when dftest is given, additionally the transformed
    test frame. The scaler is fitted on the training split only, to avoid leakage.
    """
    #separating target variable and features
    df = dftrain
    y_ml = df[target_feature]
    x_ml = df[num_feature + cat_feature]
    #encoding categorical features
    x_ml_dum = pd.get_dummies(x_ml, columns=cat_feature, drop_first=True)
    #splitting data
    xtr, xts, ytr, yts = train_test_split(x_ml_dum, y_ml, train_size=train_size)
    #scaling numerical features (fit on the training split only)
    xtr_sc = xtr.copy()
    xts_sc = xts.copy()
    sc = StandardScaler()
    xtr_sc[num_feature] = sc.fit_transform(xtr[num_feature])
    xts_sc[num_feature] = sc.transform(xts[num_feature])
    if dftest is None:
        return xtr_sc, xts_sc, ytr, yts
    dfts = dftest[num_feature + cat_feature]
    dfts_dum = pd.get_dummies(dfts, columns=cat_feature, drop_first=True)
    # align the test dummies to the training columns: categories absent from (or extra in)
    # the test set would otherwise produce a mismatched column set; missing ones become 0
    dfts_dum = dfts_dum.reindex(columns=x_ml_dum.columns, fill_value=0)
    dfts_sc = dfts_dum.copy()
    dfts_sc[num_feature] = sc.transform(dfts_sc[num_feature])
    return xtr_sc, xts_sc, ytr, yts, dfts_sc
df_raw = pd.read_csv('train.csv')
df_raw.head(5)
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
df_raw.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
m_describe.show_main(df_raw)
Total number of features: 81 Total number of observations: 1460 ---------------------------------------------------------------------- Total number of numerical values: 38 Total number of categorical values: 43 ---------------------------------------------------------------------- Total number of missing values: 6965 Total number of duplicate rows: 0
By examining features, we want to know how the following features can affect the sale price of property:
Also, we want to build a pricing model based on above features
m_describe.missing_val_list(df_raw)
| Feature | Type | count of nulls | null % of total observations | |
|---|---|---|---|---|
| 0 | LotFrontage | Numerical | 259 | 18.0 |
| 1 | MasVnrArea | Numerical | 8 | 1.0 |
| 2 | GarageYrBlt | Numerical | 81 | 6.0 |
| 3 | Alley | Categorical | 1369 | 94.0 |
| 4 | MasVnrType | Categorical | 8 | 1.0 |
| 5 | BsmtQual | Categorical | 37 | 3.0 |
| 6 | BsmtCond | Categorical | 37 | 3.0 |
| 7 | BsmtExposure | Categorical | 38 | 3.0 |
| 8 | BsmtFinType1 | Categorical | 37 | 3.0 |
| 9 | BsmtFinType2 | Categorical | 38 | 3.0 |
| 10 | Electrical | Categorical | 1 | 0.0 |
| 11 | FireplaceQu | Categorical | 690 | 47.0 |
| 12 | GarageType | Categorical | 81 | 6.0 |
| 13 | GarageFinish | Categorical | 81 | 6.0 |
| 14 | GarageQual | Categorical | 81 | 6.0 |
| 15 | GarageCond | Categorical | 81 | 6.0 |
| 16 | PoolQC | Categorical | 1453 | 100.0 |
| 17 | Fence | Categorical | 1179 | 81.0 |
| 18 | MiscFeature | Categorical | 1406 | 96.0 |
df1 = df_raw.copy()
df1.drop(labels=['Alley','FireplaceQu','PoolQC','Fence','MiscFeature'],axis=1,inplace=True)
m_describe.show_main(df1)
Total number of features: 76 Total number of observations: 1460 ---------------------------------------------------------------------- Total number of numerical values: 38 Total number of categorical values: 38 ---------------------------------------------------------------------- Total number of missing values: 868 Total number of duplicate rows: 0
Let's examine categorical variables to figure what to do with missing values
m_describe.visualize_cat(df1)
observations in features ['Street','Utilities','Condition2','RoofMatl'] are concentrated in one category so they don't add new information to our data so I'll delete them
df2 = df1.copy()
df2.drop(labels=['Street','Utilities','Condition2','RoofMatl'],axis=1,inplace=True)
What about exploring numerical features
#Exploring numerical features
m_describe.visualize_num(df2)
Based on basic exploration of numerical features:
So, I will delete the above features
df3 = df2.copy()
labels=['Id','LowQualFinSF','BsmtHalfBath','KitchenAbvGr','EnclosedPorch','3SsnPorch',
'ScreenPorch','PoolArea','MiscVal','YrSold']
df3.drop(labels=labels,axis=1,inplace=True)
['MSSubClass','OverallQual','OverallCond','BsmtFullBath','FullBath','HalfBath','BedroomAbvGr','Fireplaces','GarageCars']
list_num= m_describe.num(df3)
list_cat= m_describe.cat(df3)
len(list_num)
28
len(list_cat)
34
cat_list=['MSSubClass','OverallQual','OverallCond','BsmtFullBath','FullBath','HalfBath','BedroomAbvGr','Fireplaces','GarageCars']
# these discrete numeric columns are really categorical: move each name from
# the numerical list to the categorical list (iterate directly, no indexing)
for name in cat_list:
    list_num.remove(name)
    list_cat.append(name)
len(list_num)
19
len(list_cat)
43
So in our reduced dataset, we have total 62 features (19 numerical and 43 categorical)
m_describe.missing_val_list(df3)
| Feature | Type | count of nulls | null % of total observations | |
|---|---|---|---|---|
| 0 | LotFrontage | Numerical | 259 | 18.0 |
| 1 | MasVnrArea | Numerical | 8 | 1.0 |
| 2 | GarageYrBlt | Numerical | 81 | 6.0 |
| 3 | MasVnrType | Categorical | 8 | 1.0 |
| 4 | BsmtQual | Categorical | 37 | 3.0 |
| 5 | BsmtCond | Categorical | 37 | 3.0 |
| 6 | BsmtExposure | Categorical | 38 | 3.0 |
| 7 | BsmtFinType1 | Categorical | 37 | 3.0 |
| 8 | BsmtFinType2 | Categorical | 38 | 3.0 |
| 9 | Electrical | Categorical | 1 | 0.0 |
| 10 | GarageType | Categorical | 81 | 6.0 |
| 11 | GarageFinish | Categorical | 81 | 6.0 |
| 12 | GarageQual | Categorical | 81 | 6.0 |
| 13 | GarageCond | Categorical | 81 | 6.0 |
m_describe.visualize_num(df3,features=['LotFrontage','MasVnrArea','GarageYrBlt'],figure_size=(10,10))
#LotFrontage
# roughly normal distribution with outliers, so fill missing values with the median
df4 = df3.copy()
# assign rather than inplace-fillna on a column slice: chained inplace fillna
# is deprecated and silently ineffective under pandas copy-on-write
df4['LotFrontage'] = df4['LotFrontage'].fillna(df4['LotFrontage'].median())
#GarageYrBlt
# I think there is no single correct technique for filling missing values in such a feature, so I will choose the median method
df5 = df4.copy()
# assign rather than inplace-fillna on a column slice (deprecated in pandas)
df5['GarageYrBlt'] = df5['GarageYrBlt'].fillna(df5['GarageYrBlt'].median())
#MasVnrArea
df3['MasVnrArea'].value_counts()[0]/df3['MasVnrArea'].value_counts().sum()*100
59.29752066115702
I will fill missing values with most frequently used observation
df6 = df5.copy()
# fill with the most frequent value; assignment avoids deprecated inplace fillna
df6['MasVnrArea'] = df6['MasVnrArea'].fillna(df6['MasVnrArea'].mode()[0])
df6['MasVnrArea'].isnull().sum()
0
m_describe.missing_val_list(df6)
| Feature | Type | count of nulls | null % of total observations | |
|---|---|---|---|---|
| 0 | MasVnrType | Categorical | 8 | 1.0 |
| 1 | BsmtQual | Categorical | 37 | 3.0 |
| 2 | BsmtCond | Categorical | 37 | 3.0 |
| 3 | BsmtExposure | Categorical | 38 | 3.0 |
| 4 | BsmtFinType1 | Categorical | 37 | 3.0 |
| 5 | BsmtFinType2 | Categorical | 38 | 3.0 |
| 6 | Electrical | Categorical | 1 | 0.0 |
| 7 | GarageType | Categorical | 81 | 6.0 |
| 8 | GarageFinish | Categorical | 81 | 6.0 |
| 9 | GarageQual | Categorical | 81 | 6.0 |
| 10 | GarageCond | Categorical | 81 | 6.0 |
# I'll fill all missing values with mode
df7 = df6.copy()
df7.fillna(df7.mode().iloc[0],inplace=True)
There are many techniques to detect and handle outliers, I've created a code that runs 6 different outlier detection techniques and get a list of outliers which is confirmed by all these methods.
drop_list= m_outlier_all(df7,n=0,features=list_num)
Method IQR: Total number of outliers is: 61 Total number of observations containing more than 0 outliers is: 527 Percentage of data to be lost is: %36.0 Method Standard deviation: Total number of outliers is: 22 Total number of observations containing more than 0 outliers is: 179 Percentage of data to be lost is: %12.0 Method Z-Score: Total number of outliers is: 22 Total number of observations containing more than 0 outliers is: 179 Percentage of data to be lost is: %12.0 Method Modified Z-Score: Total number of outliers is: 56 Total number of observations containing more than 0 outliers is: 1460 Percentage of data to be lost is: %100.0 Method Isolation forest: The total number of outliers is: 118 Percentage of data to be lost is: %8.0 Method DBSCAN: The total number of clusters without outliers: 0 The number of outliers is: 1460 Percentage of data to be lost is: %100.0 ---------------------------------------------------------------------------------------------------- Total number of outliers is: 93 Percentage of data to be lost is: %6.0
df8=df7.copy()
df8.drop(labels=drop_list,axis=0,inplace=True)
m_describe.visualize_num(df7,features=list_num,figure_size=(9,45))
m_describe.visualize_num(df8,features=list_num,figure_size=(9,45))
I think that [LotArea] still has some outliers, I'll handle them again
drop_list = m_outlier_std(df8,features=['LotArea'],n=0)
Total number of outliers is: 17 Total number of observations containing more than 0 outliers is: 17 Percentage of data to be lost is: %1.0
df9 = df8.copy()
df9.drop(labels=drop_list,axis=0,inplace=True)
fig,ax = plt.subplots(1,2,figsize=(8,4))
sns.histplot(df8['LotArea'],ax=ax[0]);
sns.histplot(df9['LotArea'],ax=ax[1]);
fig,ax = plt.subplots(1,2,figsize=(8,4))
sns.boxplot(y= df8['LotArea'],ax=ax[0]);
sns.boxplot(y= df9['LotArea'],ax=ax[1]);
Let's check how much of the data we lost after completing our preprocessing stage
df9.shape[0]/df_raw.shape[0]*100
92.46575342465754
we lost 8% of data
# let's check heat map between numerical features
fig,ax= plt.subplots(1,1,figsize=(16,8))
sns.heatmap(data=df7[list_num].corr(),ax=ax,cmap='Blues',annot=True,fmt='.1g');
Too many variables to inspect, let's filter features which have high correlation
m_corr_list(df9[list_num],0.6,-0.6)
YearBuilt is positively correlated with ['YearRemodAdd', 'GarageYrBlt', 'SalePrice'] YearRemodAdd is positively correlated with ['YearBuilt', 'GarageYrBlt'] TotalBsmtSF is positively correlated with ['1stFlrSF', 'SalePrice'] 1stFlrSF is positively correlated with ['TotalBsmtSF'] 2ndFlrSF is positively correlated with ['GrLivArea'] GrLivArea is positively correlated with ['2ndFlrSF', 'TotRmsAbvGrd', 'SalePrice'] TotRmsAbvGrd is positively correlated with ['GrLivArea'] GarageYrBlt is positively correlated with ['YearBuilt', 'YearRemodAdd'] GarageArea is positively correlated with ['SalePrice'] SalePrice is positively correlated with ['YearBuilt', 'TotalBsmtSF', 'GrLivArea', 'GarageArea']
Let's check correlation between above variables
#YearBuilt
fig,ax=plt.subplots(nrows=1,ncols=3,figsize=(15,4))
sns.scatterplot(data=df9,x='YearBuilt',y='YearRemodAdd',ax=ax[0])
sns.scatterplot(data=df9,x='YearBuilt',y='GarageYrBlt',ax=ax[1])
sns.scatterplot(data=df9,x='YearBuilt',y='SalePrice',ax=ax[2]);
#Let's check after how many years people remodel their properties and its effect on sale price
fig,ax=plt.subplots(1,3,figsize=(14,4))
maintenance_period = df9['YearRemodAdd']-df9['YearBuilt']
sns.boxplot(y=maintenance_period,ax=ax[0]);
sns.histplot(maintenance_period,ax=ax[1]);
sns.scatterplot(x=maintenance_period,y=df9['SalePrice'],ax=ax[2]);
maintenance_period.value_counts()
0 717
1 189
30 19
9 14
2 13
...
94 1
56 1
108 1
111 1
84 1
Length: 104, dtype: int64
#Let's see when garages were built
fig,ax=plt.subplots(1,3,figsize=(14,4))
garage_period = df9['GarageYrBlt']-df9['YearBuilt']
sns.boxplot(y=garage_period,ax=ax[0]);
sns.histplot(garage_period,ax=ax[1]);
sns.scatterplot(x=garage_period,y=df9['SalePrice'],ax=ax[2]);
x = pd.DataFrame(garage_period.value_counts()).reset_index()
x[(x['index']<0)][0].sum()
17
There is a positive correlation with high width between YearBuilt and SalePrice
In 53% of the data, YearBuilt and YearRemodAdd are the same, which is not logical; maybe YearRemodAdd is used to describe the last time the property was built/renewed
In 60% of the data, GarageYrBlt is the same as the property's YearBuilt, which is logical. However, there are 17 observations where the garage was built before the property!
#TotalBsmtSF
fig,ax=plt.subplots(nrows=1,ncols=2,figsize=(15,4))
sns.scatterplot(data=df9,x='TotalBsmtSF',y='1stFlrSF',ax=ax[0])
sns.scatterplot(data=df9,x='TotalBsmtSF',y='SalePrice',ax=ax[1]);
#GrLivArea
fig,ax=plt.subplots(nrows=1,ncols=3,figsize=(15,4))
sns.scatterplot(data=df9,x='GrLivArea',y='2ndFlrSF',ax=ax[0])
sns.scatterplot(data=df9,x='GrLivArea',y='TotRmsAbvGrd',ax=ax[1])
sns.scatterplot(data=df9,x='GrLivArea',y='SalePrice',ax=ax[2]);
sns.scatterplot(df9['GarageYrBlt'],df9['SalePrice']);
fig,a=plt.subplots(1,2,figsize=(12,4))
sns.scatterplot(df9['YearBuilt'],df9['SalePrice'],ax=a[0]);
sns.scatterplot(df9['YearRemodAdd'],df9['SalePrice'],ax=a[1]);
I think YearRemodAdd will be more representative for SalePrice than YearBuilt I will drop the following columns: YearBuilt / GarageYrBlt since they don't add any new value to our dataset
df10=df9.copy()
df10.drop(labels=['YearBuilt','GarageYrBlt'],axis=1,inplace=True)
1- Does property type [MSSubClass] have an effect on [SalePrice]?
2- Relationship between MSZoning and SalePrice
3- Relationship between Neighborhood, MSSubClass, MSZoning and SalePrice
4- Relationship between LotArea, LotFrontage and SalePrice
5- Relationship between OverallQual and OverallCond
6- Relationship between OverallQual and OverallCond and YearBuilt and YearRemodAdd
7- When do they Remod the property
8- Relationship between YearBuilt / YearRemodAdd and SalePrice
9- Relationship between BsmtFinSF1 and BsntFinSF2 and BsmtUnfSF and TotalBsmtSF,
10- Is there any relation between Bsmt and LotArea or SalePrice
11- Do SalePrice affected by month sold over the years
12- Does Condition1 affect any feature?
13- RoofStyle and LotArea or Saleprice
14- SaleType and SaleCondition
15- Does Utilities like: Heating, HeatingQC, CentrailAir, Electrical affect SalePrice, SaleType?
16- Can we combine the above utilities feature into a new one?
sns.countplot(df9['MSSubClass'],order=df9['MSSubClass'].value_counts().index,color='Blue');
plt.pie(df9['MSSubClass'].value_counts(),labels=df9['MSSubClass'].value_counts().index,radius=2,autopct='%0.1f');
fig,ax=plt.subplots(1,1,figsize=(15,9))
sns.boxplot(data=df9,y='SalePrice',x='MSSubClass',ax=ax,order=df9['MSSubClass'].value_counts().index);
Answer
• 50% of observation are in category 20 and 60, and they contain most of outliers
• All categories share the same range of SalePrice
sns.countplot(df9['MSZoning'],order=df9['MSZoning'].value_counts().index,color='Blue');
plt.pie(df9['MSZoning'].value_counts(),labels=df9['MSZoning'].value_counts().index,radius=1,autopct='%0.1f');
fig,ax=plt.subplots(1,1,figsize=(15,9))
sns.boxplot(data=df9,y='SalePrice',x='MSZoning',ax=ax,order=df9['MSZoning'].value_counts().index);
Answer
• 93% of observation are on 2 categories RL and RM
• All categories share the same range of SalePrice
fig,ax=plt.subplots(1,1,figsize=(20,6))
sns.countplot(df9['Neighborhood'],order=df9['Neighborhood'].value_counts().index,color='Blue',ax=ax);
plt.pie(df9['Neighborhood'].value_counts(),labels=df9['Neighborhood'].value_counts().index,radius=3,autopct='%0.1f');
fig,ax=plt.subplots(1,1,figsize=(20,9))
sns.boxplot(data=df9,y='SalePrice',x='Neighborhood',ax=ax,order=df9['Neighborhood'].value_counts().index);
cont_table=pd.crosstab(index=df9['Neighborhood'],columns=df9['MSZoning'])
print (cont_table)
plt.figure(figsize=(12,8))
cont_table.plot(kind='bar',stacked=True);
MSZoning C (all) FV RH RL RM Neighborhood Blmngtn 0 0 0 16 1 Blueste 0 0 0 0 2 BrDale 0 0 0 0 16 BrkSide 0 0 0 27 30 ClearCr 0 0 0 13 0 CollgCr 0 0 0 138 10 Crawfor 0 0 2 41 3 Edwards 0 0 2 87 8 Gilbert 0 0 0 77 0 IDOTRR 8 0 0 0 28 MeadowV 0 0 0 0 16 Mitchel 0 0 0 40 5 NAmes 0 0 2 214 0 NPkVill 0 0 0 9 0 NWAmes 0 0 0 69 0 NoRidge 0 0 0 29 0 NridgHt 0 0 0 58 1 OldTown 1 0 0 17 88 SWISU 0 0 4 19 0 Sawyer 0 0 0 70 2 SawyerW 0 0 5 50 0 Somerst 0 64 0 20 0 StoneBr 0 0 0 20 0 Timber 0 0 0 30 0 Veenker 0 0 0 8 0
<Figure size 864x576 with 0 Axes>
Answer
• Neighborhoods [NridgHt and StoneBr and NoRidge] have high range of SalePrice and the rest of neighborhoods share the same range.
• MSZoning:
fig,a=plt.subplots(1,2,figsize=(12,6))
sns.scatterplot(data=df10,x='LotArea',y='SalePrice',ax=a[0],hue='Neighborhood');
sns.scatterplot(data=df10,x='LotFrontage',y='SalePrice',ax=a[1],hue='Neighborhood');
fig,a=plt.subplots(1,2,figsize=(12,6))
sns.scatterplot(data=df10,x='LotArea',y='SalePrice',ax=a[0],hue='BldgType');
sns.scatterplot(data=df10,x='LotFrontage',y='SalePrice',ax=a[1],hue='BldgType');
Answer
• In general, there is no correlation between LotArea, LotFrontage and SalePrice even if we show variations in Neighborhood or MSZoning
sns.scatterplot(data=df10,x='OverallQual',y='OverallCond');
Answer
• There is not relation between OverallQual and OverallCond
sns.scatterplot(data=df10,x='YearRemodAdd',y='OverallCond');
Answer
from heatmap and our analysis before we can notice the following:
• In 53% of data, YearBuilt and YearRemodAdd are the same which is not logic, may be it’s used to describe the last time the property was built/renewed
• Remod period ranges from 20 to 100 years
Answer
from heatmap and our analysis before we can notice the following:
• There is positive correlation between YearRemodAdd and SalePrice
• I think YearRemodAdd will be more representative for SalePrice than YearBuilt I will drop the following columns: YearBuilt / GarageYrBlt since they don't add any new value to our dataset
• Positive correlation with high width between YearBuilt and SalePrice
• Alarm: in 53% of data, YearBuilt and YearRemodAdd are the same which is not logic, may be it’s used to describe the last time the property was built/renewed
• In 60% of data, GarageYearBlt is the same as property yearbuilt which is logic. However, there are 17 observations where garage was built before property
#Examining Bsmt Features
sns.scatterplot(data=df10,x='BsmtFinSF1')
<AxesSubplot:xlabel='BsmtFinSF1'>
Answer
sns.scatterplot(df10['BsmtFinSF1'],df10['SalePrice']);
sns.scatterplot(df10['BsmtFinSF1'],df10['LotArea']);
Answer
• There is positive correlation between BsmtFinSF1 and SalePrice and no correlation to LotArea
sns.countplot(df10['MoSold'])
<AxesSubplot:xlabel='MoSold', ylabel='count'>
Answer
• Most of sales are done in 2nd and 3rd quarter of the year
plt.figure(figsize=(10,6))
sns.scatterplot(df10['SaleType'],df10['SalePrice'],hue=df10['SaleCondition'])
<AxesSubplot:xlabel='SaleType', ylabel='SalePrice'>
Answer
• It’s important for our model I think
Answer
• no obvious relation
Answer
• Only Salecondition: new are sold partially
fig,a = plt.subplots(1,2,figsize=(10,4))
sns.scatterplot(data=df10,x='Heating',y='SalePrice',ax=a[0],hue='HeatingQC')
sns.scatterplot(data=df10,x='HeatingQC',y='SalePrice',ax=a[1])
<AxesSubplot:xlabel='HeatingQC', ylabel='SalePrice'>
sns.scatterplot(data=df10,x='CentralAir',y='SalePrice',hue='Heating')
<AxesSubplot:xlabel='CentralAir', ylabel='SalePrice'>
sns.scatterplot(data=df10,x='Electrical',y='SalePrice',hue='CentralAir')
<AxesSubplot:xlabel='Electrical', ylabel='SalePrice'>
Answer
• Heating & HeatingQC: other than GasA, you have a very limited options in saleprice
• CentralAir: if property doesn’t have centralAir, it will not have GasA heating and it’s price will drop dramatically
• Electrical: Saleprice will drop if electrical system is other than SBrkr
Answer
df10.columns
Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotShape',
'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearRemodAdd',
'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType',
'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
list_num=m_describe.num(df10)
list_cat=m_describe.cat(df10)
list_num
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'SalePrice']
#changing the target variable column [MEDV] to be the first column
col_raw_ordered = ['SalePrice','MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearRemodAdd', 'MasVnrArea',
'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold',]
df11_num = df10[col_raw_ordered]
df11_cat = df10[cat_list]
#Starting with numerical values
y_raw = df11_num[col_raw_ordered[0:1]]
x_raw = df11_num[col_raw_ordered[1:]]
x_raw_tr,x_raw_ts, y_raw_tr,y_raw_ts = train_test_split(x_raw,y_raw,train_size=0.9)
m_mlinear_regression(x_raw_tr,x_raw_ts,y_raw_tr,y_raw_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 12 | GradientBoosting | 500 | No | 0.902432 |
| 1 | 23 | Random forest | 500 | Yes | 0.893437 |
| 2 | 10 | Random forest | 500 | No | 0.891533 |
| 3 | 21 | Random forest | 300 | Yes | 0.891109 |
| 4 | 9 | Random forest | 400 | No | 0.890923 |
| 5 | 7 | Random forest | 200 | No | 0.889693 |
| 6 | 6 | Random forest | 100 | No | 0.888806 |
| 7 | 22 | Random forest | 400 | Yes | 0.888531 |
| 8 | 20 | Random forest | 200 | Yes | 0.888022 |
| 9 | 8 | Random forest | 300 | No | 0.887878 |
| 10 | 19 | Random forest | 100 | Yes | 0.887338 |
| 11 | 13 | Linear | General | Yes | 0.878505 |
| 12 | 0 | Linear | General | No | 0.878505 |
| 13 | 11 | XGBoost | 500 | No | 0.869463 |
| 14 | 24 | XGBoost | 500 | Yes | 0.869294 |
| 15 | 2 | SVR | linear | No | 0.827006 |
| 16 | 18 | Decision tree | General | Yes | 0.707237 |
| 17 | 5 | Decision tree | General | No | 0.699308 |
| 18 | 15 | SVR | linear | Yes | 0.138306 |
| 19 | 17 | SVR | sigmoid | Yes | -0.015275 |
| 20 | 3 | SVR | poly | No | -0.016327 |
| 21 | 16 | SVR | poly | Yes | -0.019008 |
| 22 | 14 | SVR | rbf | Yes | -0.019509 |
| 23 | 1 | SVR | rbf | No | -0.021247 |
| 24 | 4 | SVR | sigmoid | No | -0.021660 |
Without any feature engineering yet and modelling only numerical values, we created a model with an accuracy of >80%.
Best models so far are Random forest (scaled) / GradientBoosting / Linear / XGBoost
#checking constant variables
selector_const = VarianceThreshold(threshold=0.0)
selector_const.fit(x_raw_tr)
VarianceThreshold()
selector_const.get_support().sum(), x_raw_tr.shape[1]
(25, 25)
There are no constant features
#checking quasi constant variables
selector_quasi = VarianceThreshold(threshold=0.01)
selector_quasi.fit(x_raw_tr)
VarianceThreshold(threshold=0.01)
selector_quasi.get_support().sum(), x_raw_tr.shape[1]
(25, 25)
There are no semi constant features
#creating features and variance list
features_variance = {'feature':list(x_raw_tr.columns),'feature_variance':list(selector_quasi.get_support())}
df_features_variance = pd.DataFrame(features_variance)
df_features_variance[(df_features_variance['feature_variance']==False)]
| feature | feature_variance |
|---|
m_correlation(x_raw_tr,0.8)
['GarageArea', 'TotRmsAbvGrd']
There is not a heavy correlation between ['TotRmsAbvGrd', 'GarageArea']
sns.scatterplot(df10['TotRmsAbvGrd'],df10['GarageArea'])
<AxesSubplot:xlabel='TotRmsAbvGrd', ylabel='GarageArea'>
m_VIF(x_raw_tr,y_raw_tr)
Traceback (most recent call last): File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code exec(code_obj, self.user_global_ns, self.user_ns) File "C:\Users\MAHMOU~1\AppData\Local\Temp/ipykernel_17644/2427951001.py", line 1, in <module> m_VIF(x_raw_tr,y_raw_tr) File "C:\Users\MAHMOU~1\AppData\Local\Temp/ipykernel_17644/1115767830.py", line 306, in m_VIF y,x = dmatrices(string_fun, data=df_vif_gen, return_type='dataframe') File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\highlevel.py", line 309, in dmatrices (lhs, rhs) = _do_highlevel_design(formula_like, data, eval_env, File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\highlevel.py", line 164, in _do_highlevel_design design_infos = _try_incr_builders(formula_like, data_iter_maker, eval_env, File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\highlevel.py", line 66, in _try_incr_builders return design_matrix_builders([formula_like.lhs_termlist, File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\build.py", line 689, in design_matrix_builders factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env) File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\build.py", line 354, in _factors_memorize which_pass = factor.memorize_passes_needed(state, eval_env) File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\eval.py", line 474, in memorize_passes_needed subset_names = [name for name in ast_names(self.code) File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\eval.py", line 474, in <listcomp> subset_names = [name for name in ast_names(self.code) File "C:\Users\Mahmoud 
Shahwan\AppData\Local\Programs\Python\Python310\lib\site-packages\patsy\eval.py", line 105, in ast_names for node in ast.walk(ast.parse(code)): File "C:\Users\Mahmoud Shahwan\AppData\Local\Programs\Python\Python310\lib\ast.py", line 50, in parse return compile(source, filename, mode, flags, File "<unknown>", line 1 1 stFlrSF ^ SyntaxError: invalid syntax
m_feature_importance(x_raw_tr,y_raw_tr,100)
| features | importance | |
|---|---|---|
| 3 | OverallQual | 0.593498 |
| 13 | GrLivArea | 0.103954 |
| 7 | BsmtFinSF1 | 0.051249 |
| 10 | TotalBsmtSF | 0.050500 |
| 21 | GarageArea | 0.036426 |
| 20 | GarageCars | 0.023829 |
| 5 | YearRemodAdd | 0.022984 |
| 11 | 1stFlrSF | 0.017105 |
| 2 | LotArea | 0.016082 |
| 4 | OverallCond | 0.009390 |
| 1 | LotFrontage | 0.009105 |
| 9 | BsmtUnfSF | 0.008239 |
| 23 | OpenPorchSF | 0.008232 |
| 12 | 2ndFlrSF | 0.007807 |
| 6 | MasVnrArea | 0.007026 |
| 15 | FullBath | 0.007017 |
| 24 | MoSold | 0.005355 |
| 22 | WoodDeckSF | 0.004828 |
| 0 | MSSubClass | 0.004621 |
| 19 | Fireplaces | 0.004207 |
| 18 | TotRmsAbvGrd | 0.003039 |
| 14 | BsmtFullBath | 0.001894 |
| 17 | BedroomAbvGr | 0.001526 |
| 16 | HalfBath | 0.001526 |
| 8 | BsmtFinSF2 | 0.000564 |
# let's drop the following columns and check the model
df12=df11_num.copy()
drop_list= ['GarageArea','BsmtFinSF2','BsmtFullBath','HalfBath','BedroomAbvGr','FullBath','WoodDeckSF','MoSold']
df12.drop(labels=drop_list,axis=1,inplace=True)
col_12=list(df12.columns)
y_12 = df12[col_12[0:1]]
x_12 = df12[col_12[1:]]
x_12_tr,x_12_ts, y_12_tr,y_12_ts = train_test_split(x_12,y_12,train_size=0.9)
m_mlinear_regression(x_12_tr,x_12_ts,y_12_tr,y_12_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 12 | GradientBoosting | 500 | No | 0.855188 |
| 1 | 11 | XGBoost | 500 | No | 0.829923 |
| 2 | 24 | XGBoost | 500 | Yes | 0.829861 |
| 3 | 20 | Random forest | 200 | Yes | 0.828376 |
| 4 | 9 | Random forest | 400 | No | 0.827057 |
| 5 | 19 | Random forest | 100 | Yes | 0.826137 |
| 6 | 10 | Random forest | 500 | No | 0.825910 |
| 7 | 22 | Random forest | 400 | Yes | 0.823819 |
| 8 | 7 | Random forest | 200 | No | 0.823366 |
| 9 | 21 | Random forest | 300 | Yes | 0.822218 |
| 10 | 8 | Random forest | 300 | No | 0.821454 |
| 11 | 23 | Random forest | 500 | Yes | 0.820810 |
| 12 | 6 | Random forest | 100 | No | 0.820345 |
| 13 | 0 | Linear | General | No | 0.804493 |
| 14 | 13 | Linear | General | Yes | 0.804493 |
| 15 | 2 | SVR | linear | No | 0.750263 |
| 16 | 18 | Decision tree | General | Yes | 0.736065 |
| 17 | 5 | Decision tree | General | No | 0.727493 |
| 18 | 15 | SVR | linear | Yes | 0.052501 |
| 19 | 3 | SVR | poly | No | -0.056864 |
| 20 | 17 | SVR | sigmoid | Yes | -0.062723 |
| 21 | 16 | SVR | poly | Yes | -0.064337 |
| 22 | 14 | SVR | rbf | Yes | -0.067486 |
| 23 | 1 | SVR | rbf | No | -0.068367 |
| 24 | 4 | SVR | sigmoid | No | -0.069442 |
our models didn't perform worse and didn't improve very much, let's drop more features
m_feature_importance(x_12_tr,y_12_tr,100)
| features | importance | |
|---|---|---|
| 3 | OverallQual | 0.603452 |
| 12 | GrLivArea | 0.120890 |
| 7 | BsmtFinSF1 | 0.063655 |
| 9 | TotalBsmtSF | 0.052792 |
| 5 | YearRemodAdd | 0.026932 |
| 15 | GarageCars | 0.026667 |
| 2 | LotArea | 0.019411 |
| 10 | 1stFlrSF | 0.018058 |
| 1 | LotFrontage | 0.012134 |
| 8 | BsmtUnfSF | 0.010128 |
| 4 | OverallCond | 0.010096 |
| 16 | OpenPorchSF | 0.008560 |
| 6 | MasVnrArea | 0.007439 |
| 11 | 2ndFlrSF | 0.007043 |
| 14 | Fireplaces | 0.005129 |
| 0 | MSSubClass | 0.004794 |
| 13 | TotRmsAbvGrd | 0.002821 |
Let's try dropping features ['TotRmsAbvGrd','MSSubClass','Fireplaces','2ndFlrSF','MasVnrArea','OpenPorchSF']
# let's drop the following columns and check the model
df13=df12.copy()
drop_list= ['TotRmsAbvGrd','MSSubClass','Fireplaces','2ndFlrSF','MasVnrArea','OpenPorchSF']
df13.drop(labels=drop_list,axis=1,inplace=True)
col_13=list(df13.columns)
y_13 = df13[col_13[0:1]]
x_13 = df13[col_13[1:]]
x_13_tr,x_13_ts, y_13_tr,y_13_ts = train_test_split(x_13,y_13,train_size=0.9)
m_mlinear_regression(x_13_tr,x_13_ts,y_13_tr,y_13_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 12 | GradientBoosting | 500 | No | 0.900764 |
| 1 | 6 | Random forest | 100 | No | 0.899507 |
| 2 | 19 | Random forest | 100 | Yes | 0.898567 |
| 3 | 22 | Random forest | 400 | Yes | 0.898156 |
| 4 | 21 | Random forest | 300 | Yes | 0.898126 |
| 5 | 9 | Random forest | 400 | No | 0.897912 |
| 6 | 10 | Random forest | 500 | No | 0.897076 |
| 7 | 20 | Random forest | 200 | Yes | 0.896969 |
| 8 | 8 | Random forest | 300 | No | 0.896817 |
| 9 | 23 | Random forest | 500 | Yes | 0.896727 |
| 10 | 7 | Random forest | 200 | No | 0.896381 |
| 11 | 11 | XGBoost | 500 | No | 0.884948 |
| 12 | 24 | XGBoost | 500 | Yes | 0.884916 |
| 13 | 0 | Linear | General | No | 0.880498 |
| 14 | 13 | Linear | General | Yes | 0.880498 |
| 15 | 2 | SVR | linear | No | 0.777130 |
| 16 | 18 | Decision tree | General | Yes | 0.773413 |
| 17 | 5 | Decision tree | General | No | 0.773084 |
| 18 | 15 | SVR | linear | Yes | 0.058699 |
| 19 | 16 | SVR | poly | Yes | -0.024337 |
| 20 | 3 | SVR | poly | No | -0.025856 |
| 21 | 17 | SVR | sigmoid | Yes | -0.026622 |
| 22 | 14 | SVR | rbf | Yes | -0.031286 |
| 23 | 1 | SVR | rbf | No | -0.032721 |
| 24 | 4 | SVR | sigmoid | No | -0.033067 |
Our models didn't perform worse, but they also didn't improve much. Let's drop more features: ['OverallCond','BsmtUnfSF','LotFrontage','1stFlrSF']
# Remove a further batch of low-importance features and inspect the result.
df14 = df13.copy()
drop_list = ['OverallCond', 'BsmtUnfSF', 'LotFrontage', '1stFlrSF']
df14.drop(columns=drop_list, inplace=True)

col_14 = list(df14.columns)
y_14 = df14[col_14[:1]]   # target (first column)
x_14 = df14[col_14[1:]]   # remaining predictors
x_14.head(2)
| LotArea | OverallQual | YearRemodAdd | BsmtFinSF1 | TotalBsmtSF | GrLivArea | GarageCars | |
|---|---|---|---|---|---|---|---|
| 0 | 8450 | 7 | 2003 | 706 | 856 | 1710 | 2 |
| 1 | 9600 | 6 | 1976 | 978 | 1262 | 1262 | 2 |
# 90/10 train/test split on the reduced feature set, then run the model comparison.
x_14_tr,x_14_ts, y_14_tr,y_14_ts = train_test_split(x_14,y_14,train_size=0.9)
m_mlinear_regression(x_14_tr,x_14_ts,y_14_tr,y_14_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 8 | Random forest | 300 | No | 0.865158 |
| 1 | 23 | Random forest | 500 | Yes | 0.864834 |
| 2 | 7 | Random forest | 200 | No | 0.863494 |
| 3 | 19 | Random forest | 100 | Yes | 0.862940 |
| 4 | 12 | GradientBoosting | 500 | No | 0.862879 |
| 5 | 10 | Random forest | 500 | No | 0.862679 |
| 6 | 22 | Random forest | 400 | Yes | 0.862280 |
| 7 | 9 | Random forest | 400 | No | 0.861905 |
| 8 | 21 | Random forest | 300 | Yes | 0.861078 |
| 9 | 6 | Random forest | 100 | No | 0.859666 |
| 10 | 20 | Random forest | 200 | Yes | 0.857477 |
| 11 | 24 | XGBoost | 500 | Yes | 0.842095 |
| 12 | 11 | XGBoost | 500 | No | 0.842057 |
| 13 | 0 | Linear | General | No | 0.833490 |
| 14 | 13 | Linear | General | Yes | 0.833490 |
| 15 | 2 | SVR | linear | No | 0.712274 |
| 16 | 18 | Decision tree | General | Yes | 0.683998 |
| 17 | 5 | Decision tree | General | No | 0.676579 |
| 18 | 15 | SVR | linear | Yes | 0.004919 |
| 19 | 16 | SVR | poly | Yes | -0.055906 |
| 20 | 3 | SVR | poly | No | -0.059716 |
| 21 | 17 | SVR | sigmoid | Yes | -0.064312 |
| 22 | 14 | SVR | rbf | Yes | -0.071412 |
| 23 | 1 | SVR | rbf | No | -0.073059 |
| 24 | 4 | SVR | sigmoid | No | -0.074296 |
So far, removing features to simplify the model hasn't hurt its performance.
m_feature_importance(x_14_tr,y_14_tr,100)
| features | importance | |
|---|---|---|
| 1 | OverallQual | 0.614199 |
| 5 | GrLivArea | 0.130794 |
| 4 | TotalBsmtSF | 0.080351 |
| 3 | BsmtFinSF1 | 0.063637 |
| 2 | YearRemodAdd | 0.040657 |
| 0 | LotArea | 0.037097 |
| 6 | GarageCars | 0.033265 |
# Drop two more candidate features and check the model.
df15 = df14.copy()
drop_list = ['BsmtFinSF1', 'GarageCars']
df15.drop(columns=drop_list, inplace=True)

col_15 = list(df15.columns)
# BUG FIX: the original sliced df14 here (y_15 = df14[...], x_15 = df14[...]),
# which made the df15 copy/drop above dead code. It selected the same columns by
# name, so the values were unchanged, but the code was inconsistent and fragile;
# slice df15 so the frame used is the one the columns were dropped from.
y_15 = df15[col_15[:1]]   # target
x_15 = df15[col_15[1:]]   # predictors
x_15.head(2)
| LotArea | OverallQual | YearRemodAdd | TotalBsmtSF | GrLivArea | |
|---|---|---|---|---|---|
| 0 | 8450 | 7 | 2003 | 856 | 1710 |
| 1 | 9600 | 6 | 1976 | 1262 | 1262 |
# 90/10 split on the further-reduced feature set, then run the model comparison.
x_15_tr,x_15_ts, y_15_tr,y_15_ts = train_test_split(x_15,y_15,train_size=0.9)
m_mlinear_regression(x_15_tr,x_15_ts,y_15_tr,y_15_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 10 | Random forest | 500 | No | 0.903119 |
| 1 | 22 | Random forest | 400 | Yes | 0.901671 |
| 2 | 23 | Random forest | 500 | Yes | 0.901059 |
| 3 | 8 | Random forest | 300 | No | 0.900711 |
| 4 | 21 | Random forest | 300 | Yes | 0.900632 |
| 5 | 20 | Random forest | 200 | Yes | 0.900537 |
| 6 | 7 | Random forest | 200 | No | 0.899549 |
| 7 | 9 | Random forest | 400 | No | 0.899417 |
| 8 | 12 | GradientBoosting | 500 | No | 0.898244 |
| 9 | 6 | Random forest | 100 | No | 0.897216 |
| 10 | 19 | Random forest | 100 | Yes | 0.896225 |
| 11 | 11 | XGBoost | 500 | No | 0.864287 |
| 12 | 13 | Linear | General | Yes | 0.864220 |
| 13 | 0 | Linear | General | No | 0.864220 |
| 14 | 24 | XGBoost | 500 | Yes | 0.863827 |
| 15 | 18 | Decision tree | General | Yes | 0.804277 |
| 16 | 5 | Decision tree | General | No | 0.797608 |
| 17 | 2 | SVR | linear | No | 0.765371 |
| 18 | 15 | SVR | linear | Yes | -0.053179 |
| 19 | 16 | SVR | poly | Yes | -0.100683 |
| 20 | 3 | SVR | poly | No | -0.106841 |
| 21 | 17 | SVR | sigmoid | Yes | -0.113306 |
| 22 | 14 | SVR | rbf | Yes | -0.121019 |
| 23 | 1 | SVR | rbf | No | -0.122994 |
| 24 | 4 | SVR | sigmoid | No | -0.124439 |
The model started to perform worse, so I will discard the last change and instead try to improve it by adding categorical variables.
# Project the raw features onto 5 principal components.
# (Other n_components values were tried; none improved the model.)
pca = PCA(n_components=5)
pca.fit(x_raw_tr)
x_pca_tr = pca.transform(x_raw_tr)
x_pca_ts = pca.transform(x_raw_ts)
m_mlinear_regression(x_pca_tr, x_pca_ts, y_raw_tr, y_raw_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 12 | GradientBoosting | 500 | No | 0.780768 |
| 1 | 19 | Random forest | 100 | Yes | 0.771952 |
| 2 | 22 | Random forest | 400 | Yes | 0.761053 |
| 3 | 8 | Random forest | 300 | No | 0.759499 |
| 4 | 23 | Random forest | 500 | Yes | 0.758470 |
| 5 | 9 | Random forest | 400 | No | 0.758236 |
| 6 | 7 | Random forest | 200 | No | 0.758188 |
| 7 | 21 | Random forest | 300 | Yes | 0.756034 |
| 8 | 10 | Random forest | 500 | No | 0.755593 |
| 9 | 20 | Random forest | 200 | Yes | 0.751808 |
| 10 | 11 | XGBoost | 500 | No | 0.746493 |
| 11 | 24 | XGBoost | 500 | Yes | 0.746493 |
| 12 | 6 | Random forest | 100 | No | 0.743414 |
| 13 | 0 | Linear | General | No | 0.710513 |
| 14 | 13 | Linear | General | Yes | 0.710513 |
| 15 | 2 | SVR | linear | No | 0.702248 |
| 16 | 18 | Decision tree | General | Yes | 0.507148 |
| 17 | 5 | Decision tree | General | No | 0.494842 |
| 18 | 3 | SVR | poly | No | -0.000139 |
| 19 | 15 | SVR | linear | Yes | -0.000242 |
| 20 | 16 | SVR | poly | Yes | -0.017568 |
| 21 | 17 | SVR | sigmoid | Yes | -0.017613 |
| 22 | 14 | SVR | rbf | Yes | -0.019767 |
| 23 | 4 | SVR | sigmoid | No | -0.019847 |
| 24 | 1 | SVR | rbf | No | -0.020681 |
# Project the raw features with Linear Discriminant Analysis.
# NOTE(review): LDA is a classification technique; fitting it on a continuous
# target treats every distinct price as its own class — confirm this is intended.
lda = LinearDiscriminantAnalysis()
# FIX: the original called fit_transform and discarded its result, then
# transformed the training set a second time. Fit once, transform once.
lda.fit(x_raw_tr, y_raw_tr)
x_lda_tr = lda.transform(x_raw_tr)
x_lda_ts = lda.transform(x_raw_ts)
m_mlinear_regression(x_lda_tr, x_lda_ts, y_raw_tr, y_raw_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 24 | XGBoost | 500 | Yes | 0.909980 |
| 1 | 11 | XGBoost | 500 | No | 0.909980 |
| 2 | 20 | Random forest | 200 | Yes | 0.901626 |
| 3 | 6 | Random forest | 100 | No | 0.901198 |
| 4 | 23 | Random forest | 500 | Yes | 0.900538 |
| 5 | 8 | Random forest | 300 | No | 0.900257 |
| 6 | 19 | Random forest | 100 | Yes | 0.899189 |
| 7 | 10 | Random forest | 500 | No | 0.898668 |
| 8 | 9 | Random forest | 400 | No | 0.898219 |
| 9 | 22 | Random forest | 400 | Yes | 0.898023 |
| 10 | 21 | Random forest | 300 | Yes | 0.896624 |
| 11 | 7 | Random forest | 200 | No | 0.895493 |
| 12 | 12 | GradientBoosting | 500 | No | 0.892298 |
| 13 | 13 | Linear | General | Yes | 0.878505 |
| 14 | 0 | Linear | General | No | 0.878505 |
| 15 | 18 | Decision tree | General | Yes | 0.794361 |
| 16 | 5 | Decision tree | General | No | 0.763956 |
| 17 | 2 | SVR | linear | No | 0.224913 |
| 18 | 15 | SVR | linear | Yes | 0.009137 |
| 19 | 4 | SVR | sigmoid | No | -0.014029 |
| 20 | 3 | SVR | poly | No | -0.018283 |
| 21 | 1 | SVR | rbf | No | -0.019290 |
| 22 | 17 | SVR | sigmoid | Yes | -0.020399 |
| 23 | 14 | SVR | rbf | Yes | -0.021029 |
| 24 | 16 | SVR | poly | Yes | -0.021490 |
Let's try to add some important categorical features and see if it improves our model
# Candidate categorical features to evaluate one at a time.
cat_features = ['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
                'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
                'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
                'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual',
                'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

# For each categorical feature: append it to the current numeric feature set,
# one-hot encode it (first level dropped to avoid the dummy trap), then re-run
# the model comparison and print the score table.
# BUG FIX: the original iterated `list_cat`, which is never defined; the list
# defined directly above is `cat_features`.
for feat in cat_features:
    col = list(df14.columns)
    col.append(feat)
    df16 = df10[col]  # df10 still holds the full (pre-drop) data, including `feat`
    dum_df = pd.get_dummies(df16[feat], drop_first=True)
    df16_dum = pd.concat([df16, dum_df], axis=1)
    df16_dum = df16_dum.drop(columns=[feat])  # keep only the dummy columns
    col_16 = list(df16_dum.columns)
    y_16 = df16_dum[col_16[:1]]
    x_16 = df16_dum[col_16[1:]]
    x_16_tr, x_16_ts, y_16_tr, y_16_ts = train_test_split(x_16, y_16, train_size=0.9)
    print('')
    print(feat)
    print(m_mlinear_regression(x_16_tr, x_16_ts, y_16_tr, y_16_ts))
MSZoning
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.906894
1 19 Random forest 100 Yes 0.904173
2 9 Random forest 400 No 0.903847
3 22 Random forest 400 Yes 0.903111
4 23 Random forest 500 Yes 0.902808
5 10 Random forest 500 No 0.902224
6 21 Random forest 300 Yes 0.901670
7 8 Random forest 300 No 0.900937
8 20 Random forest 200 Yes 0.900689
9 7 Random forest 200 No 0.898641
10 6 Random forest 100 No 0.895859
11 24 XGBoost 500 Yes 0.879593
12 11 XGBoost 500 No 0.879066
13 13 Linear General Yes 0.862381
14 0 Linear General No 0.862381
15 2 SVR linear No 0.769801
16 5 Decision tree General No 0.764069
17 18 Decision tree General Yes 0.763223
18 15 SVR linear Yes 0.024517
19 3 SVR poly No -0.057469
20 16 SVR poly Yes -0.059597
21 17 SVR sigmoid Yes -0.061965
22 14 SVR rbf Yes -0.066416
23 1 SVR rbf No -0.068284
24 4 SVR sigmoid No -0.069067
LotShape
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.891722
1 19 Random forest 100 Yes 0.879808
2 8 Random forest 300 No 0.878894
3 20 Random forest 200 Yes 0.876443
4 10 Random forest 500 No 0.875334
5 9 Random forest 400 No 0.875193
6 23 Random forest 500 Yes 0.873679
7 7 Random forest 200 No 0.873556
8 11 XGBoost 500 No 0.873500
9 24 XGBoost 500 Yes 0.873414
10 21 Random forest 300 Yes 0.872809
11 22 Random forest 400 Yes 0.872034
12 6 Random forest 100 No 0.871113
13 13 Linear General Yes 0.837916
14 0 Linear General No 0.837916
15 2 SVR linear No 0.835232
16 18 Decision tree General Yes 0.623780
17 5 Decision tree General No 0.606494
18 15 SVR linear Yes 0.002621
19 3 SVR poly No -0.075989
20 17 SVR sigmoid Yes -0.080684
21 16 SVR poly Yes -0.084496
22 14 SVR rbf Yes -0.086578
23 1 SVR rbf No -0.088749
24 4 SVR sigmoid No -0.089913
LandContour
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.874922
1 22 Random forest 400 Yes 0.861382
2 9 Random forest 400 No 0.861344
3 7 Random forest 200 No 0.860057
4 19 Random forest 100 Yes 0.859383
5 23 Random forest 500 Yes 0.859165
6 20 Random forest 200 Yes 0.858330
7 10 Random forest 500 No 0.857606
8 21 Random forest 300 Yes 0.857352
9 8 Random forest 300 No 0.856374
10 6 Random forest 100 No 0.850645
11 13 Linear General Yes 0.842168
12 0 Linear General No 0.842168
13 11 XGBoost 500 No 0.836461
14 24 XGBoost 500 Yes 0.835528
15 2 SVR linear No 0.777092
16 18 Decision tree General Yes 0.754325
17 5 Decision tree General No 0.747725
18 15 SVR linear Yes 0.003696
19 3 SVR poly No -0.066071
20 16 SVR poly Yes -0.066223
21 17 SVR sigmoid Yes -0.068656
22 14 SVR rbf Yes -0.073061
23 1 SVR rbf No -0.074816
24 4 SVR sigmoid No -0.075255
LotConfig
index Model Type Scaled Score
0 19 Random forest 100 Yes 0.875458
1 9 Random forest 400 No 0.873928
2 22 Random forest 400 Yes 0.873392
3 8 Random forest 300 No 0.872983
4 21 Random forest 300 Yes 0.872962
5 20 Random forest 200 Yes 0.872925
6 23 Random forest 500 Yes 0.871858
7 7 Random forest 200 No 0.871472
8 10 Random forest 500 No 0.870404
9 6 Random forest 100 No 0.867844
10 12 GradientBoosting 500 No 0.861294
11 11 XGBoost 500 No 0.853060
12 24 XGBoost 500 Yes 0.852355
13 13 Linear General Yes 0.813399
14 0 Linear General No 0.813399
15 2 SVR linear No 0.738740
16 5 Decision tree General No 0.733446
17 18 Decision tree General Yes 0.723337
18 15 SVR linear Yes 0.016272
19 17 SVR sigmoid Yes -0.054149
20 16 SVR poly Yes -0.055603
21 3 SVR poly No -0.056398
22 14 SVR rbf Yes -0.058240
23 1 SVR rbf No -0.060510
24 4 SVR sigmoid No -0.060629
LandSlope
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.896767
1 22 Random forest 400 Yes 0.890676
2 23 Random forest 500 Yes 0.890398
3 9 Random forest 400 No 0.890105
4 10 Random forest 500 No 0.888890
5 20 Random forest 200 Yes 0.888623
6 8 Random forest 300 No 0.888589
7 21 Random forest 300 Yes 0.888152
8 7 Random forest 200 No 0.887656
9 19 Random forest 100 Yes 0.887089
10 6 Random forest 100 No 0.885281
11 11 XGBoost 500 No 0.871312
12 24 XGBoost 500 Yes 0.870630
13 0 Linear General No 0.860911
14 13 Linear General Yes 0.860911
15 2 SVR linear No 0.783303
16 5 Decision tree General No 0.753450
17 18 Decision tree General Yes 0.740278
18 15 SVR linear Yes 0.062220
19 16 SVR poly Yes -0.010853
20 3 SVR poly No -0.013382
21 17 SVR sigmoid Yes -0.014037
22 14 SVR rbf Yes -0.019392
23 1 SVR rbf No -0.022006
24 4 SVR sigmoid No -0.022785
Neighborhood
index Model Type Scaled Score
0 24 XGBoost 500 Yes 0.849440
1 11 XGBoost 500 No 0.849422
2 12 GradientBoosting 500 No 0.844066
3 21 Random forest 300 Yes 0.835275
4 23 Random forest 500 Yes 0.833494
5 10 Random forest 500 No 0.832570
6 8 Random forest 300 No 0.831768
7 7 Random forest 200 No 0.831573
8 22 Random forest 400 Yes 0.831399
9 20 Random forest 200 Yes 0.831167
10 6 Random forest 100 No 0.830666
11 9 Random forest 400 No 0.830379
12 13 Linear General Yes 0.830272
13 0 Linear General No 0.830272
14 19 Random forest 100 Yes 0.823787
15 2 SVR linear No 0.737248
16 5 Decision tree General No 0.726832
17 18 Decision tree General Yes 0.703027
18 15 SVR linear Yes 0.029510
19 3 SVR poly No -0.061424
20 17 SVR sigmoid Yes -0.067250
21 16 SVR poly Yes -0.068363
22 14 SVR rbf Yes -0.068882
23 1 SVR rbf No -0.070046
24 4 SVR sigmoid No -0.070446
Condition1
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.877613
1 10 Random forest 500 No 0.872507
2 8 Random forest 300 No 0.871525
3 22 Random forest 400 Yes 0.871381
4 7 Random forest 200 No 0.871185
5 9 Random forest 400 No 0.871105
6 23 Random forest 500 Yes 0.870813
7 21 Random forest 300 Yes 0.870607
8 20 Random forest 200 Yes 0.870299
9 6 Random forest 100 No 0.869834
10 19 Random forest 100 Yes 0.868633
11 11 XGBoost 500 No 0.856586
12 24 XGBoost 500 Yes 0.856356
13 13 Linear General Yes 0.839798
14 0 Linear General No 0.839798
15 2 SVR linear No 0.743629
16 18 Decision tree General Yes 0.713406
17 5 Decision tree General No 0.697876
18 15 SVR linear Yes 0.044808
19 3 SVR poly No -0.027210
20 16 SVR poly Yes -0.027630
21 17 SVR sigmoid Yes -0.027701
22 14 SVR rbf Yes -0.029843
23 1 SVR rbf No -0.031934
24 4 SVR sigmoid No -0.032502
BldgType
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.880640
1 19 Random forest 100 Yes 0.866031
2 21 Random forest 300 Yes 0.864810
3 24 XGBoost 500 Yes 0.862924
4 10 Random forest 500 No 0.862619
5 11 XGBoost 500 No 0.862264
6 23 Random forest 500 Yes 0.862156
7 9 Random forest 400 No 0.861947
8 22 Random forest 400 Yes 0.860860
9 6 Random forest 100 No 0.860557
10 7 Random forest 200 No 0.860511
11 8 Random forest 300 No 0.859667
12 20 Random forest 200 Yes 0.858738
13 0 Linear General No 0.844409
14 13 Linear General Yes 0.844409
15 2 SVR linear No 0.757999
16 18 Decision tree General Yes 0.734163
17 5 Decision tree General No 0.725728
18 15 SVR linear Yes -0.015901
19 3 SVR poly No -0.088825
20 17 SVR sigmoid Yes -0.090645
21 16 SVR poly Yes -0.091669
22 14 SVR rbf Yes -0.094631
23 1 SVR rbf No -0.096780
24 4 SVR sigmoid No -0.097271
HouseStyle
index Model Type Scaled Score
0 6 Random forest 100 No 0.888045
1 21 Random forest 300 Yes 0.883706
2 20 Random forest 200 Yes 0.883188
3 22 Random forest 400 Yes 0.883027
4 24 XGBoost 500 Yes 0.881596
5 11 XGBoost 500 No 0.881596
6 10 Random forest 500 No 0.881368
7 8 Random forest 300 No 0.880668
8 23 Random forest 500 Yes 0.880643
9 12 GradientBoosting 500 No 0.880099
10 9 Random forest 400 No 0.879559
11 7 Random forest 200 No 0.879299
12 19 Random forest 100 Yes 0.877091
13 13 Linear General Yes 0.839040
14 0 Linear General No 0.839040
15 2 SVR linear No 0.780912
16 5 Decision tree General No 0.763005
17 18 Decision tree General Yes 0.760309
18 15 SVR linear Yes 0.083341
19 3 SVR poly No 0.003418
20 16 SVR poly Yes 0.002803
21 17 SVR sigmoid Yes 0.001152
22 14 SVR rbf Yes -0.002230
23 1 SVR rbf No -0.003894
24 4 SVR sigmoid No -0.004577
RoofStyle
index Model Type Scaled Score
0 22 Random forest 400 Yes 0.917282
1 21 Random forest 300 Yes 0.917157
2 10 Random forest 500 No 0.917067
3 23 Random forest 500 Yes 0.916133
4 9 Random forest 400 No 0.915917
5 8 Random forest 300 No 0.915767
6 7 Random forest 200 No 0.915183
7 12 GradientBoosting 500 No 0.913429
8 6 Random forest 100 No 0.913210
9 19 Random forest 100 Yes 0.912821
10 20 Random forest 200 Yes 0.911630
11 24 XGBoost 500 Yes 0.882381
12 11 XGBoost 500 No 0.882381
13 13 Linear General Yes 0.877343
14 0 Linear General No 0.877343
15 18 Decision tree General Yes 0.826687
16 5 Decision tree General No 0.795590
17 2 SVR linear No 0.792975
18 15 SVR linear Yes -0.005811
19 3 SVR poly No -0.076177
20 17 SVR sigmoid Yes -0.077058
21 16 SVR poly Yes -0.077155
22 14 SVR rbf Yes -0.080232
23 1 SVR rbf No -0.081900
24 4 SVR sigmoid No -0.082684
Exterior1st
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.878692
1 10 Random forest 500 No 0.872226
2 11 XGBoost 500 No 0.871323
3 24 XGBoost 500 Yes 0.870794
4 9 Random forest 400 No 0.870672
5 23 Random forest 500 Yes 0.869952
6 20 Random forest 200 Yes 0.869258
7 7 Random forest 200 No 0.869238
8 22 Random forest 400 Yes 0.869238
9 6 Random forest 100 No 0.867695
10 8 Random forest 300 No 0.867651
11 19 Random forest 100 Yes 0.866847
12 21 Random forest 300 Yes 0.865221
13 0 Linear General No 0.863392
14 13 Linear General Yes 0.863392
15 18 Decision tree General Yes 0.784450
16 2 SVR linear No 0.763103
17 5 Decision tree General No 0.744210
18 15 SVR linear Yes 0.073874
19 3 SVR poly No -0.010757
20 17 SVR sigmoid Yes -0.011273
21 14 SVR rbf Yes -0.013665
22 16 SVR poly Yes -0.014521
23 1 SVR rbf No -0.015216
24 4 SVR sigmoid No -0.015808
Exterior2nd
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.904375
1 19 Random forest 100 Yes 0.889119
2 6 Random forest 100 No 0.888267
3 10 Random forest 500 No 0.886317
4 9 Random forest 400 No 0.886284
5 11 XGBoost 500 No 0.885880
6 22 Random forest 400 Yes 0.885820
7 21 Random forest 300 Yes 0.885718
8 24 XGBoost 500 Yes 0.885633
9 7 Random forest 200 No 0.885194
10 8 Random forest 300 No 0.885139
11 23 Random forest 500 Yes 0.883096
12 20 Random forest 200 Yes 0.881926
13 13 Linear General Yes 0.872241
14 0 Linear General No 0.872241
15 2 SVR linear No 0.777708
16 18 Decision tree General Yes 0.764460
17 5 Decision tree General No 0.757623
18 15 SVR linear Yes 0.062420
19 3 SVR poly No -0.019929
20 17 SVR sigmoid Yes -0.022223
21 16 SVR poly Yes -0.022779
22 14 SVR rbf Yes -0.023878
23 1 SVR rbf No -0.025671
24 4 SVR sigmoid No -0.025747
MasVnrType
index Model Type Scaled Score
0 20 Random forest 200 Yes 0.878051
1 7 Random forest 200 No 0.877963
2 8 Random forest 300 No 0.877948
3 21 Random forest 300 Yes 0.877863
4 9 Random forest 400 No 0.877689
5 10 Random forest 500 No 0.876366
6 6 Random forest 100 No 0.876262
7 23 Random forest 500 Yes 0.876084
8 22 Random forest 400 Yes 0.875747
9 19 Random forest 100 Yes 0.874459
10 12 GradientBoosting 500 No 0.855375
11 11 XGBoost 500 No 0.843130
12 24 XGBoost 500 Yes 0.843012
13 18 Decision tree General Yes 0.825496
14 13 Linear General Yes 0.823654
15 0 Linear General No 0.823654
16 5 Decision tree General No 0.810839
17 2 SVR linear No 0.760954
18 15 SVR linear Yes 0.089291
19 16 SVR poly Yes 0.005424
20 3 SVR poly No 0.002521
21 17 SVR sigmoid Yes 0.001578
22 14 SVR rbf Yes -0.004367
23 1 SVR rbf No -0.005576
24 4 SVR sigmoid No -0.006610
ExterQual
index Model Type Scaled Score
0 11 XGBoost 500 No 0.835695
1 24 XGBoost 500 Yes 0.835556
2 12 GradientBoosting 500 No 0.834114
3 7 Random forest 200 No 0.830053
4 8 Random forest 300 No 0.829274
5 23 Random forest 500 Yes 0.828829
6 22 Random forest 400 Yes 0.827878
7 19 Random forest 100 Yes 0.826568
8 10 Random forest 500 No 0.825730
9 20 Random forest 200 Yes 0.824885
10 9 Random forest 400 No 0.824727
11 21 Random forest 300 Yes 0.823907
12 6 Random forest 100 No 0.822913
13 0 Linear General No 0.802465
14 13 Linear General Yes 0.802465
15 5 Decision tree General No 0.735165
16 18 Decision tree General Yes 0.718342
17 2 SVR linear No 0.704337
18 15 SVR linear Yes 0.018095
19 16 SVR poly Yes -0.062490
20 3 SVR poly No -0.065802
21 17 SVR sigmoid Yes -0.070305
22 14 SVR rbf Yes -0.076115
23 1 SVR rbf No -0.078362
24 4 SVR sigmoid No -0.078881
ExterCond
index Model Type Scaled Score
0 6 Random forest 100 No 0.881092
1 24 XGBoost 500 Yes 0.880946
2 11 XGBoost 500 No 0.880900
3 12 GradientBoosting 500 No 0.880733
4 8 Random forest 300 No 0.880712
5 22 Random forest 400 Yes 0.879889
6 9 Random forest 400 No 0.879802
7 10 Random forest 500 No 0.878828
8 19 Random forest 100 Yes 0.878343
9 21 Random forest 300 Yes 0.878297
10 23 Random forest 500 Yes 0.878219
11 20 Random forest 200 Yes 0.876516
12 7 Random forest 200 No 0.876494
13 13 Linear General Yes 0.862022
14 0 Linear General No 0.862022
15 2 SVR linear No 0.771948
16 18 Decision tree General Yes 0.760780
17 5 Decision tree General No 0.755209
18 15 SVR linear Yes 0.047162
19 17 SVR sigmoid Yes -0.022653
20 16 SVR poly Yes -0.022783
21 3 SVR poly No -0.024364
22 14 SVR rbf Yes -0.026400
23 1 SVR rbf No -0.028279
24 4 SVR sigmoid No -0.028765
Foundation
index Model Type Scaled Score
0 0 Linear General No 0.849045
1 13 Linear General Yes 0.849045
2 20 Random forest 200 Yes 0.846887
3 23 Random forest 500 Yes 0.846418
4 21 Random forest 300 Yes 0.846160
5 22 Random forest 400 Yes 0.846024
6 7 Random forest 200 No 0.845927
7 6 Random forest 100 No 0.845528
8 9 Random forest 400 No 0.845300
9 10 Random forest 500 No 0.844466
10 8 Random forest 300 No 0.843785
11 19 Random forest 100 Yes 0.843410
12 12 GradientBoosting 500 No 0.836681
13 11 XGBoost 500 No 0.798000
14 24 XGBoost 500 Yes 0.798000
15 18 Decision tree General Yes 0.758832
16 2 SVR linear No 0.750502
17 5 Decision tree General No 0.747045
18 15 SVR linear Yes 0.060981
19 3 SVR poly No -0.027173
20 17 SVR sigmoid Yes -0.027666
21 16 SVR poly Yes -0.031348
22 14 SVR rbf Yes -0.032207
23 1 SVR rbf No -0.035089
24 4 SVR sigmoid No -0.035961
BsmtQual
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.827117
1 6 Random forest 100 No 0.816104
2 8 Random forest 300 No 0.814444
3 23 Random forest 500 Yes 0.813529
4 21 Random forest 300 Yes 0.812891
5 20 Random forest 200 Yes 0.811707
6 10 Random forest 500 No 0.811065
7 7 Random forest 200 No 0.810634
8 22 Random forest 400 Yes 0.809940
9 19 Random forest 100 Yes 0.809389
10 9 Random forest 400 No 0.808783
11 24 XGBoost 500 Yes 0.806720
12 11 XGBoost 500 No 0.806116
13 0 Linear General No 0.792999
14 13 Linear General Yes 0.792999
15 2 SVR linear No 0.702707
16 5 Decision tree General No 0.692443
17 18 Decision tree General Yes 0.680052
18 15 SVR linear Yes 0.079125
19 3 SVR poly No -0.001742
20 16 SVR poly Yes -0.003684
21 17 SVR sigmoid Yes -0.004136
22 14 SVR rbf Yes -0.009408
23 1 SVR rbf No -0.011558
24 4 SVR sigmoid No -0.012275
BsmtCond
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.888053
1 9 Random forest 400 No 0.877506
2 23 Random forest 500 Yes 0.876779
3 7 Random forest 200 No 0.876482
4 20 Random forest 200 Yes 0.876338
5 6 Random forest 100 No 0.875852
6 8 Random forest 300 No 0.875693
7 22 Random forest 400 Yes 0.875688
8 21 Random forest 300 Yes 0.875608
9 10 Random forest 500 No 0.874551
10 19 Random forest 100 Yes 0.873167
11 11 XGBoost 500 No 0.853280
12 24 XGBoost 500 Yes 0.853274
13 13 Linear General Yes 0.850614
14 0 Linear General No 0.850614
15 18 Decision tree General Yes 0.785877
16 5 Decision tree General No 0.785451
17 2 SVR linear No 0.733598
18 15 SVR linear Yes 0.068632
19 16 SVR poly Yes 0.015189
20 17 SVR sigmoid Yes -0.000325
21 3 SVR poly No -0.002255
22 14 SVR rbf Yes -0.004007
23 1 SVR rbf No -0.005747
24 4 SVR sigmoid No -0.006345
BsmtExposure
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.902980
1 10 Random forest 500 No 0.897319
2 7 Random forest 200 No 0.897043
3 20 Random forest 200 Yes 0.896181
4 23 Random forest 500 Yes 0.896113
5 8 Random forest 300 No 0.896111
6 22 Random forest 400 Yes 0.895909
7 19 Random forest 100 Yes 0.895754
8 21 Random forest 300 Yes 0.894304
9 9 Random forest 400 No 0.893801
10 13 Linear General Yes 0.892651
11 0 Linear General No 0.892651
12 6 Random forest 100 No 0.889897
13 24 XGBoost 500 Yes 0.863054
14 11 XGBoost 500 No 0.862879
15 2 SVR linear No 0.802051
16 18 Decision tree General Yes 0.739442
17 5 Decision tree General No 0.705607
18 15 SVR linear Yes 0.075317
19 16 SVR poly Yes -0.001217
20 17 SVR sigmoid Yes -0.001902
21 3 SVR poly No -0.005340
22 14 SVR rbf Yes -0.006972
23 1 SVR rbf No -0.008634
24 4 SVR sigmoid No -0.009141
BsmtFinType1
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.900864
1 24 XGBoost 500 Yes 0.898907
2 11 XGBoost 500 No 0.898713
3 9 Random forest 400 No 0.895917
4 10 Random forest 500 No 0.895784
5 20 Random forest 200 Yes 0.895400
6 7 Random forest 200 No 0.895171
7 23 Random forest 500 Yes 0.894824
8 21 Random forest 300 Yes 0.894567
9 6 Random forest 100 No 0.893647
10 22 Random forest 400 Yes 0.893519
11 8 Random forest 300 No 0.893468
12 19 Random forest 100 Yes 0.888675
13 13 Linear General Yes 0.869595
14 0 Linear General No 0.869595
15 18 Decision tree General Yes 0.807494
16 5 Decision tree General No 0.800558
17 2 SVR linear No 0.785751
18 15 SVR linear Yes 0.015423
19 17 SVR sigmoid Yes -0.064263
20 16 SVR poly Yes -0.065476
21 3 SVR poly No -0.066118
22 14 SVR rbf Yes -0.068628
23 1 SVR rbf No -0.070145
24 4 SVR sigmoid No -0.070298
BsmtFinType2
index Model Type Scaled Score
0 20 Random forest 200 Yes 0.878983
1 23 Random forest 500 Yes 0.877156
2 10 Random forest 500 No 0.876851
3 7 Random forest 200 No 0.876070
4 19 Random forest 100 Yes 0.875809
5 6 Random forest 100 No 0.875694
6 21 Random forest 300 Yes 0.875590
7 8 Random forest 300 No 0.875359
8 22 Random forest 400 Yes 0.873973
9 9 Random forest 400 No 0.873446
10 12 GradientBoosting 500 No 0.861137
11 13 Linear General Yes 0.848239
12 0 Linear General No 0.848239
13 11 XGBoost 500 No 0.820005
14 24 XGBoost 500 Yes 0.819709
15 18 Decision tree General Yes 0.777325
16 5 Decision tree General No 0.765929
17 2 SVR linear No 0.742043
18 15 SVR linear Yes -0.039146
19 17 SVR sigmoid Yes -0.110826
20 16 SVR poly Yes -0.112371
21 3 SVR poly No -0.112694
22 14 SVR rbf Yes -0.114346
23 1 SVR rbf No -0.116030
24 4 SVR sigmoid No -0.116160
Heating
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.861000
1 22 Random forest 400 Yes 0.857389
2 21 Random forest 300 Yes 0.856824
3 8 Random forest 300 No 0.856270
4 23 Random forest 500 Yes 0.856221
5 9 Random forest 400 No 0.855851
6 7 Random forest 200 No 0.855773
7 20 Random forest 200 Yes 0.855502
8 10 Random forest 500 No 0.855185
9 6 Random forest 100 No 0.854403
10 19 Random forest 100 Yes 0.850813
11 24 XGBoost 500 Yes 0.834715
12 11 XGBoost 500 No 0.834636
13 13 Linear General Yes 0.811740
14 0 Linear General No 0.811740
15 5 Decision tree General No 0.771897
16 2 SVR linear No 0.750012
17 18 Decision tree General Yes 0.748476
18 15 SVR linear Yes 0.068385
19 16 SVR poly Yes 0.024317
20 3 SVR poly No 0.000235
21 17 SVR sigmoid Yes -0.000728
22 14 SVR rbf Yes -0.003436
23 1 SVR rbf No -0.005738
24 4 SVR sigmoid No -0.006352
HeatingQC
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.881603
1 10 Random forest 500 No 0.867339
2 22 Random forest 400 Yes 0.866126
3 8 Random forest 300 No 0.865605
4 23 Random forest 500 Yes 0.865514
5 20 Random forest 200 Yes 0.865489
6 21 Random forest 300 Yes 0.865376
7 19 Random forest 100 Yes 0.864051
8 9 Random forest 400 No 0.863855
9 6 Random forest 100 No 0.862734
10 7 Random forest 200 No 0.862182
11 13 Linear General Yes 0.859416
12 0 Linear General No 0.859416
13 11 XGBoost 500 No 0.852989
14 24 XGBoost 500 Yes 0.852989
15 5 Decision tree General No 0.766831
16 18 Decision tree General Yes 0.759584
17 2 SVR linear No 0.753130
18 15 SVR linear Yes 0.069117
19 17 SVR sigmoid Yes -0.008615
20 3 SVR poly No -0.008956
21 16 SVR poly Yes -0.009511
22 14 SVR rbf Yes -0.012912
23 1 SVR rbf No -0.014819
24 4 SVR sigmoid No -0.015240
CentralAir
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.868128
1 11 XGBoost 500 No 0.853363
2 24 XGBoost 500 Yes 0.852922
3 7 Random forest 200 No 0.840447
4 9 Random forest 400 No 0.840056
5 8 Random forest 300 No 0.837440
6 10 Random forest 500 No 0.837142
7 20 Random forest 200 Yes 0.835045
8 22 Random forest 400 Yes 0.833013
9 23 Random forest 500 Yes 0.831443
10 21 Random forest 300 Yes 0.830547
11 6 Random forest 100 No 0.829587
12 19 Random forest 100 Yes 0.826979
13 0 Linear General No 0.826851
14 13 Linear General Yes 0.826851
15 2 SVR linear No 0.758410
16 18 Decision tree General Yes 0.640120
17 5 Decision tree General No 0.638409
18 15 SVR linear Yes 0.044832
19 16 SVR poly Yes -0.021674
20 17 SVR sigmoid Yes -0.023000
21 3 SVR poly No -0.023894
22 14 SVR rbf Yes -0.028407
23 1 SVR rbf No -0.030385
24 4 SVR sigmoid No -0.030705
Electrical
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.887560
1 6 Random forest 100 No 0.885208
2 10 Random forest 500 No 0.885087
3 21 Random forest 300 Yes 0.883547
4 7 Random forest 200 No 0.883444
5 8 Random forest 300 No 0.882110
6 19 Random forest 100 Yes 0.881497
7 20 Random forest 200 Yes 0.881156
8 22 Random forest 400 Yes 0.880920
9 23 Random forest 500 Yes 0.880101
10 9 Random forest 400 No 0.879318
11 24 XGBoost 500 Yes 0.862897
12 11 XGBoost 500 No 0.862077
13 0 Linear General No 0.855961
14 13 Linear General Yes 0.855961
15 2 SVR linear No 0.809236
16 5 Decision tree General No 0.751300
17 18 Decision tree General Yes 0.730740
18 15 SVR linear Yes 0.048284
19 16 SVR poly Yes -0.025578
20 3 SVR poly No -0.027048
21 17 SVR sigmoid Yes -0.027384
22 14 SVR rbf Yes -0.031280
23 1 SVR rbf No -0.033750
24 4 SVR sigmoid No -0.034072
KitchenQual
index Model Type Scaled Score
0 8 Random forest 300 No 0.906510
1 7 Random forest 200 No 0.905521
2 10 Random forest 500 No 0.904247
3 22 Random forest 400 Yes 0.904206
4 20 Random forest 200 Yes 0.904186
5 21 Random forest 300 Yes 0.903768
6 23 Random forest 500 Yes 0.903744
7 9 Random forest 400 No 0.903095
8 19 Random forest 100 Yes 0.902536
9 12 GradientBoosting 500 No 0.900678
10 0 Linear General No 0.899370
11 13 Linear General Yes 0.899370
12 6 Random forest 100 No 0.899139
13 24 XGBoost 500 Yes 0.862255
14 11 XGBoost 500 No 0.862247
15 5 Decision tree General No 0.829435
16 18 Decision tree General Yes 0.821499
17 2 SVR linear No 0.758879
18 15 SVR linear Yes 0.080487
19 16 SVR poly Yes -0.000983
20 17 SVR sigmoid Yes -0.003928
21 3 SVR poly No -0.005987
22 14 SVR rbf Yes -0.009374
23 1 SVR rbf No -0.011348
24 4 SVR sigmoid No -0.012014
Functional
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.873090
1 20 Random forest 200 Yes 0.859215
2 8 Random forest 300 No 0.858622
3 22 Random forest 400 Yes 0.858539
4 23 Random forest 500 Yes 0.858201
5 10 Random forest 500 No 0.857739
6 21 Random forest 300 Yes 0.856705
7 9 Random forest 400 No 0.855987
8 19 Random forest 100 Yes 0.853319
9 6 Random forest 100 No 0.853105
10 7 Random forest 200 No 0.851258
11 13 Linear General Yes 0.829758
12 0 Linear General No 0.829758
13 11 XGBoost 500 No 0.821302
14 24 XGBoost 500 Yes 0.820822
15 2 SVR linear No 0.759719
16 5 Decision tree General No 0.692003
17 18 Decision tree General Yes 0.670412
18 15 SVR linear Yes 0.050117
19 3 SVR poly No -0.023536
20 17 SVR sigmoid Yes -0.024718
21 16 SVR poly Yes -0.026036
22 14 SVR rbf Yes -0.027881
23 1 SVR rbf No -0.030917
24 4 SVR sigmoid No -0.031602
GarageType
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.894650
1 19 Random forest 100 Yes 0.891764
2 22 Random forest 400 Yes 0.891499
3 7 Random forest 200 No 0.891411
4 20 Random forest 200 Yes 0.890873
5 10 Random forest 500 No 0.890175
6 9 Random forest 400 No 0.889611
7 23 Random forest 500 Yes 0.889493
8 21 Random forest 300 Yes 0.889296
9 8 Random forest 300 No 0.889091
10 6 Random forest 100 No 0.887331
11 24 XGBoost 500 Yes 0.880842
12 11 XGBoost 500 No 0.880708
13 0 Linear General No 0.879862
14 13 Linear General Yes 0.879862
15 5 Decision tree General No 0.821306
16 18 Decision tree General Yes 0.811382
17 2 SVR linear No 0.753089
18 15 SVR linear Yes 0.045239
19 16 SVR poly Yes -0.031705
20 17 SVR sigmoid Yes -0.033906
21 3 SVR poly No -0.033954
22 14 SVR rbf Yes -0.038037
23 1 SVR rbf No -0.039875
24 4 SVR sigmoid No -0.040479
GarageFinish
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.881197
1 23 Random forest 500 Yes 0.879520
2 8 Random forest 300 No 0.877456
3 19 Random forest 100 Yes 0.876995
4 21 Random forest 300 Yes 0.876815
5 10 Random forest 500 No 0.876561
6 20 Random forest 200 Yes 0.876315
7 22 Random forest 400 Yes 0.876024
8 9 Random forest 400 No 0.875965
9 6 Random forest 100 No 0.873321
10 7 Random forest 200 No 0.872225
11 11 XGBoost 500 No 0.837014
12 24 XGBoost 500 Yes 0.836182
13 13 Linear General Yes 0.828400
14 0 Linear General No 0.828400
15 2 SVR linear No 0.730333
16 5 Decision tree General No 0.707610
17 18 Decision tree General Yes 0.706319
18 15 SVR linear Yes 0.047019
19 16 SVR poly Yes -0.035920
20 17 SVR sigmoid Yes -0.039009
21 3 SVR poly No -0.041284
22 14 SVR rbf Yes -0.044828
23 1 SVR rbf No -0.046431
24 4 SVR sigmoid No -0.046784
GarageQual
index Model Type Scaled Score
0 9 Random forest 400 No 0.891153
1 22 Random forest 400 Yes 0.890950
2 23 Random forest 500 Yes 0.890524
3 10 Random forest 500 No 0.890127
4 8 Random forest 300 No 0.890090
5 20 Random forest 200 Yes 0.888516
6 21 Random forest 300 Yes 0.888499
7 6 Random forest 100 No 0.886710
8 7 Random forest 200 No 0.886635
9 19 Random forest 100 Yes 0.884801
10 12 GradientBoosting 500 No 0.873253
11 11 XGBoost 500 No 0.868108
12 24 XGBoost 500 Yes 0.866124
13 13 Linear General Yes 0.836670
14 0 Linear General No 0.836670
15 2 SVR linear No 0.793278
16 18 Decision tree General Yes 0.770181
17 5 Decision tree General No 0.744393
18 15 SVR linear Yes 0.077298
19 16 SVR poly Yes 0.019028
20 17 SVR sigmoid Yes -0.000418
21 3 SVR poly No -0.001045
22 14 SVR rbf Yes -0.004422
23 1 SVR rbf No -0.006595
24 4 SVR sigmoid No -0.007124
GarageCond
index Model Type Scaled Score
0 9 Random forest 400 No 0.917464
1 22 Random forest 400 Yes 0.916756
2 21 Random forest 300 Yes 0.916034
3 10 Random forest 500 No 0.914709
4 23 Random forest 500 Yes 0.914616
5 7 Random forest 200 No 0.914537
6 8 Random forest 300 No 0.913077
7 20 Random forest 200 Yes 0.912646
8 19 Random forest 100 Yes 0.911843
9 6 Random forest 100 No 0.910702
10 24 XGBoost 500 Yes 0.897989
11 11 XGBoost 500 No 0.897873
12 12 GradientBoosting 500 No 0.890377
13 13 Linear General Yes 0.879527
14 0 Linear General No 0.879527
15 2 SVR linear No 0.818277
16 18 Decision tree General Yes 0.788855
17 5 Decision tree General No 0.781484
18 15 SVR linear Yes -0.003163
19 16 SVR poly Yes -0.077867
20 17 SVR sigmoid Yes -0.079819
21 14 SVR rbf Yes -0.083742
22 3 SVR poly No -0.084278
23 1 SVR rbf No -0.086176
24 4 SVR sigmoid No -0.086274
PavedDrive
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.860488
1 8 Random forest 300 No 0.852663
2 19 Random forest 100 Yes 0.852147
3 20 Random forest 200 Yes 0.850365
4 21 Random forest 300 Yes 0.849727
5 10 Random forest 500 No 0.847865
6 7 Random forest 200 No 0.846546
7 22 Random forest 400 Yes 0.846492
8 9 Random forest 400 No 0.845747
9 23 Random forest 500 Yes 0.845595
10 6 Random forest 100 No 0.845216
11 24 XGBoost 500 Yes 0.817871
12 11 XGBoost 500 No 0.817861
13 0 Linear General No 0.808039
14 13 Linear General Yes 0.808039
15 2 SVR linear No 0.734634
16 5 Decision tree General No 0.669203
17 18 Decision tree General Yes 0.655986
18 15 SVR linear Yes 0.010600
19 3 SVR poly No -0.067882
20 16 SVR poly Yes -0.068408
21 17 SVR sigmoid Yes -0.068843
22 14 SVR rbf Yes -0.074601
23 1 SVR rbf No -0.076672
24 4 SVR sigmoid No -0.077905
SaleType
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.851101
1 0 Linear General No 0.835568
2 13 Linear General Yes 0.835568
3 11 XGBoost 500 No 0.834040
4 24 XGBoost 500 Yes 0.834033
5 20 Random forest 200 Yes 0.808783
6 8 Random forest 300 No 0.807135
7 7 Random forest 200 No 0.803108
8 9 Random forest 400 No 0.802811
9 21 Random forest 300 Yes 0.802305
10 10 Random forest 500 No 0.802192
11 23 Random forest 500 Yes 0.801975
12 22 Random forest 400 Yes 0.801856
13 6 Random forest 100 No 0.795919
14 19 Random forest 100 Yes 0.794208
15 2 SVR linear No 0.754550
16 5 Decision tree General No 0.674915
17 18 Decision tree General Yes 0.659214
18 15 SVR linear Yes 0.049666
19 17 SVR sigmoid Yes -0.029673
20 16 SVR poly Yes -0.031173
21 14 SVR rbf Yes -0.032008
22 3 SVR poly No -0.032536
23 1 SVR rbf No -0.034096
24 4 SVR sigmoid No -0.034493
SaleCondition
index Model Type Scaled Score
0 12 GradientBoosting 500 No 0.883722
1 11 XGBoost 500 No 0.883523
2 24 XGBoost 500 Yes 0.883461
3 19 Random forest 100 Yes 0.881301
4 6 Random forest 100 No 0.880026
5 10 Random forest 500 No 0.879773
6 21 Random forest 300 Yes 0.878811
7 8 Random forest 300 No 0.878571
8 23 Random forest 500 Yes 0.878438
9 7 Random forest 200 No 0.878378
10 20 Random forest 200 Yes 0.877348
11 22 Random forest 400 Yes 0.877320
12 9 Random forest 400 No 0.876314
13 13 Linear General Yes 0.870329
14 0 Linear General No 0.870329
15 2 SVR linear No 0.800120
16 5 Decision tree General No 0.782930
17 18 Decision tree General Yes 0.773632
18 15 SVR linear Yes 0.059703
19 16 SVR poly Yes -0.009548
20 3 SVR poly No -0.016037
21 17 SVR sigmoid Yes -0.017525
22 14 SVR rbf Yes -0.021135
23 1 SVR rbf No -0.022857
24 4 SVR sigmoid No -0.023283
After looping through 34 different categorical features, 'SaleCondition' and 'CentralAir' appear to be the most relevant.
Let's construct a model containing those features.
# Combine the shortlisted categorical features with the numeric columns
# (df14) and rebuild the modelling frame from the cleaned data (df10).
feat_selected = ['SaleCondition', 'CentralAir']
num_col_selected = list(df14.columns)
col_selected = feat_selected + num_col_selected
df16 = df10[col_selected]
# One-hot encode the categoricals; drop_first avoids the dummy-variable trap.
df16_dum = pd.get_dummies(df16, drop_first=True)
col_16 = list(df16_dum.columns)
# First column is the target (presumably SalePrice — TODO confirm against df14).
# .squeeze() turns the single-column DataFrame into a Series, avoiding
# sklearn's DataConversionWarning about column-vector y.
y_16 = df16_dum[col_16[0:1]].squeeze()
x_16 = df16_dum[col_16[1:]]
# random_state makes the 90/10 split reproducible across notebook runs.
x_16_tr, x_16_ts, y_16_tr, y_16_ts = train_test_split(
    x_16, y_16, train_size=0.9, random_state=42)
m_mlinear_regression(x_16_tr, x_16_ts, y_16_tr, y_16_ts)
| index | Model | Type | Scaled | Score | |
|---|---|---|---|---|---|
| 0 | 7 | Random forest | 200 | No | 0.874724 |
| 1 | 6 | Random forest | 100 | No | 0.874039 |
| 2 | 23 | Random forest | 500 | Yes | 0.871837 |
| 3 | 20 | Random forest | 200 | Yes | 0.870894 |
| 4 | 22 | Random forest | 400 | Yes | 0.870174 |
| 5 | 9 | Random forest | 400 | No | 0.869217 |
| 6 | 21 | Random forest | 300 | Yes | 0.869215 |
| 7 | 10 | Random forest | 500 | No | 0.868835 |
| 8 | 19 | Random forest | 100 | Yes | 0.868001 |
| 9 | 8 | Random forest | 300 | No | 0.867172 |
| 10 | 12 | GradientBoosting | 500 | No | 0.861157 |
| 11 | 24 | XGBoost | 500 | Yes | 0.847065 |
| 12 | 11 | XGBoost | 500 | No | 0.846708 |
| 13 | 13 | Linear | General | Yes | 0.845372 |
| 14 | 0 | Linear | General | No | 0.845372 |
| 15 | 18 | Decision tree | General | Yes | 0.770686 |
| 16 | 5 | Decision tree | General | No | 0.724293 |
| 17 | 2 | SVR | linear | No | 0.696053 |
| 18 | 15 | SVR | linear | Yes | 0.052488 |
| 19 | 16 | SVR | poly | Yes | -0.021600 |
| 20 | 3 | SVR | poly | No | -0.025731 |
| 21 | 17 | SVR | sigmoid | Yes | -0.028781 |
| 22 | 14 | SVR | rbf | Yes | -0.032657 |
| 23 | 1 | SVR | rbf | No | -0.034487 |
| 24 | 4 | SVR | sigmoid | No | -0.035184 |
Hmm — no big improvement yet.
# Standardize features: fit the scaler on the training split only and
# reuse the same transform on the test split (no test-set leakage).
sc = StandardScaler()
xtr_sc = sc.fit_transform(x_16_tr)
xts_sc = sc.transform(x_16_ts)
# Seeded estimators so CV scores are reproducible between runs.
model_1 = RandomForestRegressor(n_estimators=100, random_state=42)
model_2 = GradientBoostingRegressor(random_state=42)
# NOTE(review): xtr_sc was scaled with statistics from ALL of x_16_tr, so
# each CV fold below has seen its own validation rows during scaling — a
# mild leak. Wrapping scaler+model in a sklearn Pipeline inside
# cross_val_score would scale per fold; tree ensembles are scale-invariant,
# so the scores here are not materially affected.
# np.ravel flattens the column-vector y to silence DataConversionWarning.
cv = cross_val_score(estimator=model_1, X=xtr_sc, y=np.ravel(y_16_tr), cv=10)
cv.mean(), cv.std(), cv.max()
(0.8663919151553108, 0.03069149896392418, 0.9153664902236791)
# 10-fold CV for the gradient-boosting model on the same scaled features.
# np.ravel flattens the column-vector y to silence DataConversionWarning.
cv = cross_val_score(estimator=model_2, X=xtr_sc, y=np.ravel(y_16_tr), cv=10)
cv.mean(), cv.std(), cv.max()
(0.8808518587877412, 0.023072393400485343, 0.9249739413136858)
So my best model so far is GradientBoostingRegressor
# Tune the tree count for the gradient-boosting model (default 5-fold CV).
parameters = {'n_estimators': [100, 200, 300]}
gsearch = GridSearchCV(estimator=model_2, param_grid=parameters)
# fit() returns the fitted searcher, so the best params can be chained.
# np.ravel flattens the column-vector y to silence DataConversionWarning.
gsearch.fit(xtr_sc, np.ravel(y_16_tr)).best_params_
{'n_estimators': 100}
So our best model so far is gradient boosting with n_estimators=100.